File.cpp 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410
  1. // Copyright 2017 Patrick Brosi
  2. // info@patrickbrosi.de
  3. #include <fcntl.h>
  4. #include <sys/stat.h>
  5. #include <sys/types.h>
  6. #include <unistd.h>
  7. #include <cassert>
  8. #include <cstring>
  9. #include <fstream>
  10. #include <iostream>
  11. #include <map>
  12. #include "xml/File.h"
  13. #include "xml/NamedEnts.h"
  14. #ifndef POSIX_FADV_SEQUENTIAL
  15. #define POSIX_FADV_SEQUENTIAL 2
  16. #endif
  17. using namespace xml;
  18. // _____________________________________________________________________________
  19. File::File(const std::string& path)
  20. : _file(0), _c(0), _lastBytes(0), _which(0), _path(path), _totReadBef(0) {
  21. _buffer = new char*[2];
  22. _buffer[0] = new char[BUFFER_S + 1];
  23. _buffer[1] = new char[BUFFER_S + 1];
  24. reset();
  25. }
  26. // _____________________________________________________________________________
  27. File::~File() {
  28. delete[] _buffer[0];
  29. delete[] _buffer[1];
  30. delete[] _buffer;
  31. close(_file);
  32. }
  33. // _____________________________________________________________________________
  34. void File::reset() {
  35. _which = 0;
  36. _s.s = NONE;
  37. _s.hanging = 0;
  38. _totReadBef = 0;
  39. if (_file) close(_file);
  40. _file = open(_path.c_str(), O_RDONLY);
  41. if (_file < 0) throw XmlFileException(std::string("could not open ") + _path);
  42. posix_fadvise(_file, 0, 0, POSIX_FADV_SEQUENTIAL);
  43. _lastBytes = read(_file, _buffer[_which], BUFFER_S);
  44. _lastNewData = _lastBytes;
  45. _c = _buffer[_which];
  46. while (!_s.tagStack.empty()) _s.tagStack.pop();
  47. _s.tagStack.push("[root]");
  48. _prevs = _s;
  49. }
  50. // _____________________________________________________________________________
  51. size_t File::level() const { return _s.tagStack.size() - _s.hanging; }
  52. // _____________________________________________________________________________
  53. ParserState File::state() { return _prevs; }
  54. // _____________________________________________________________________________
  55. void File::setState(const ParserState& s) {
  56. _s = s;
  57. _prevs = s;
  58. lseek(_file, _s.off, SEEK_SET);
  59. _totReadBef = _s.off;
  60. _lastBytes = read(_file, _buffer[_which], BUFFER_S);
  61. _lastNewData = _lastBytes;
  62. _c = _buffer[_which];
  63. next();
  64. }
  65. // _____________________________________________________________________________
  66. const Tag& File::get() const { return _ret; }
  67. // _____________________________________________________________________________
  68. bool File::next() {
  69. if (!_s.tagStack.size()) return false;
  70. // avoid too much stack copying
  71. if (_prevs.tagStack.size() != _s.tagStack.size() ||
  72. _prevs.tagStack.top() != _s.tagStack.top()) {
  73. _prevs.tagStack = _s.tagStack;
  74. }
  75. _prevs.s = _s.s;
  76. _prevs.hanging = _s.hanging;
  77. _prevs.off =
  78. _totReadBef + (_c - _buffer[_which]) - (_lastBytes - _lastNewData);
  79. if (_s.hanging) _s.hanging--;
  80. _ret.name = 0;
  81. _ret.attrs.clear();
  82. void* i;
  83. while (_lastBytes) {
  84. for (; _c - _buffer[_which] < _lastBytes; ++_c) {
  85. char c = *_c;
  86. switch (_s.s) {
  87. case NONE:
  88. if (std::isspace(c))
  89. continue;
  90. else if (c == '<') {
  91. _s.s = IN_TAG_TENTATIVE;
  92. continue;
  93. } else {
  94. _s.s = IN_TEXT;
  95. continue;
  96. }
  97. case IN_TEXT:
  98. throw XmlFileException("text nodes not yet supported");
  99. case IN_TAG_TENTATIVE:
  100. if (c == '/') {
  101. _s.s = IN_TAG_NAME_CLOSE;
  102. _tmp = _c + 1;
  103. continue;
  104. } else if (c == '?') {
  105. _s.s = IN_TAG_NAME_META;
  106. continue;
  107. } else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
  108. _s.s = IN_TAG_NAME;
  109. _ret.name = _c;
  110. continue;
  111. }
  112. case IN_TAG:
  113. if (std::isspace(c))
  114. continue;
  115. else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
  116. _s.s = IN_ATTRKEY;
  117. _tmp = _c;
  118. continue;
  119. } else if (c == '/') {
  120. _s.s = AW_CLOSING;
  121. continue;
  122. } else if (c == '>') {
  123. _s.hanging++;
  124. _s.tagStack.push(_ret.name);
  125. _s.s = WS_SKIP;
  126. continue;
  127. } else {
  128. throw XmlFileException("expected valid tag");
  129. }
  130. case IN_ATTRVAL_SQ:
  131. i = memchr(_c, '\'', _lastBytes - (_c - _buffer[_which]));
  132. if (!i) {
  133. _c = _buffer[_which] + _lastBytes;
  134. continue;
  135. } else {
  136. _c = (char*)i;
  137. _s.s = IN_TAG;
  138. *_c = 0;
  139. _ret.attrs[_tmp] = _tmp2;
  140. continue;
  141. }
  142. case IN_ATTRVAL_DQ:
  143. i = memchr(_c, '"', _lastBytes - (_c - _buffer[_which]));
  144. if (!i) {
  145. _c = _buffer[_which] + _lastBytes;
  146. continue;
  147. } else {
  148. _c = (char*)i;
  149. _s.s = IN_TAG;
  150. *_c = 0;
  151. _ret.attrs[_tmp] = _tmp2;
  152. continue;
  153. }
  154. case AW_IN_ATTRVAL:
  155. if (std::isspace(c))
  156. continue;
  157. else if (c == '\'') {
  158. _s.s = IN_ATTRVAL_SQ;
  159. _tmp2 = _c + 1;
  160. continue;
  161. } else if (c == '"') {
  162. _s.s = IN_ATTRVAL_DQ;
  163. _tmp2 = _c + 1;
  164. continue;
  165. } else {
  166. throw XmlFileException("expected attribute value");
  167. }
  168. case IN_ATTRKEY:
  169. if (std::isspace(c)) {
  170. *_c = 0;
  171. _s.s = AFTER_ATTRKEY;
  172. continue;
  173. } else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
  174. continue;
  175. } else if (c == '=') {
  176. *_c = 0;
  177. _s.s = AW_IN_ATTRVAL;
  178. continue;
  179. }
  180. throw XmlFileException("expected attribute key char or =");
  181. case AFTER_ATTRKEY:
  182. if (std::isspace(c))
  183. continue;
  184. else if (c == '=') {
  185. _s.s = AW_IN_ATTRVAL;
  186. continue;
  187. } else {
  188. // TODO: error
  189. continue;
  190. }
  191. case IN_TAG_NAME:
  192. if (std::isspace(c)) {
  193. *_c = 0;
  194. _s.s = IN_TAG;
  195. continue;
  196. } else if (c == '>') {
  197. *_c = 0;
  198. _s.hanging++;
  199. _s.tagStack.push(_ret.name);
  200. _s.s = WS_SKIP;
  201. continue;
  202. } else if (c == '/') {
  203. *_c = 0;
  204. _s.s = AW_CLOSING;
  205. continue;
  206. } else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
  207. continue;
  208. }
  209. case IN_TAG_NAME_META:
  210. // TODO: read meta tags!
  211. if (c == '>') {
  212. _s.s = NONE;
  213. continue;
  214. }
  215. continue;
  216. case IN_TAG_NAME_CLOSE:
  217. if (std::isspace(c)) {
  218. *_c = 0;
  219. _s.s = IN_TAG_CLOSE;
  220. continue;
  221. } else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
  222. continue;
  223. } else if (c == '>') {
  224. *_c = 0;
  225. if (_tmp != _s.tagStack.top()) {
  226. throw XmlFileException("closing wrong tag");
  227. }
  228. _s.tagStack.pop();
  229. _s.s = NONE;
  230. continue;
  231. }
  232. case IN_TAG_CLOSE:
  233. if (std::isspace(c))
  234. continue;
  235. else if (c == '>') {
  236. if (_tmp != _s.tagStack.top()) {
  237. throw XmlFileException("closing wrong tag");
  238. }
  239. _s.tagStack.pop();
  240. _s.s = NONE;
  241. continue;
  242. } else {
  243. throw XmlFileException("expected '>'");
  244. }
  245. case AW_CLOSING:
  246. if (c == '>') {
  247. _s.s = WS_SKIP;
  248. continue;
  249. }
  250. case WS_SKIP:
  251. if (std::isspace(c))
  252. continue;
  253. else {
  254. _s.s = NONE;
  255. return true;
  256. }
  257. }
  258. }
  259. // buffer ended, read new stuff, but copy remaining if needed
  260. size_t off = 0;
  261. if (_s.s == IN_TAG_NAME) { //|| IN_TAG_NAME_META) {
  262. off = _lastBytes - (_ret.name - _buffer[_which]);
  263. memmove(_buffer[!_which], _ret.name, off);
  264. _ret.name = _buffer[!_which];
  265. } else if (_s.s == IN_TAG_NAME_CLOSE || _s.s == IN_ATTRKEY ||
  266. _s.s == IN_TEXT) {
  267. off = _lastBytes - (_tmp - _buffer[_which]);
  268. memmove(_buffer[!_which], _tmp, off);
  269. _tmp = _buffer[!_which];
  270. } else if (_s.s == IN_ATTRVAL_SQ || _s.s == IN_ATTRVAL_DQ) {
  271. off = _lastBytes - (_tmp2 - _buffer[_which]);
  272. memmove(_buffer[!_which], _tmp2, off);
  273. _tmp2 = _buffer[!_which];
  274. }
  275. assert(off <= BUFFER_S);
  276. size_t readb = read(_file, _buffer[!_which] + off, BUFFER_S - off);
  277. if (!readb) break;
  278. _totReadBef += _lastNewData;
  279. _which = !_which;
  280. _lastNewData = readb;
  281. _lastBytes = _lastNewData + off;
  282. _c = _buffer[_which] + off;
  283. }
  284. if (_s.tagStack.size()) {
  285. if (_s.tagStack.top() != "[root]") {
  286. // TODO error
  287. throw XmlFileException("XML tree not complete");
  288. } else {
  289. _s.tagStack.pop();
  290. }
  291. }
  292. _s.s = NONE;
  293. _ret.name = "[root]";
  294. return false;
  295. }
  296. // _____________________________________________________________________________
  297. std::string File::decode(const std::string& str) { return decode(str.c_str()); }
  298. // _____________________________________________________________________________
  299. std::string File::decode(const char* str) {
  300. const char* c = strchr(str, '&');
  301. if (!c) return str;
  302. char* decRet = new char[strlen(str) + 1];
  303. const char* last = str;
  304. char* dstPt = decRet;
  305. for (; c != 0; c = strchr(c + 1, '&')) {
  306. memcpy(dstPt, last, c - last);
  307. dstPt += c - last;
  308. last = c;
  309. if (*(c + 1) == '#') {
  310. uint64_t cp = -1;
  311. char* tail;
  312. errno = 0;
  313. if (*(c + 2) == 'x' || *(c + 2) == 'X')
  314. cp = strtoul(c + 3, &tail, 16);
  315. else
  316. cp = strtoul(c + 2, &tail, 10);
  317. if (*tail == ';' && cp <= 0x1FFFFF && !errno) {
  318. dstPt += utf8(cp, dstPt);
  319. last = tail + 1;
  320. }
  321. } else {
  322. const char* e = strchr(c, ';');
  323. if (e) {
  324. char* ent = new char[e - 1 - c + 1];
  325. memcpy(ent, c + 1, e - 1 - c);
  326. ent[e - 1 - c] = 0;
  327. const auto it = xml::ENTITIES.find(ent);
  328. if (it != xml::ENTITIES.end()) {
  329. const char* utf8 = it->second;
  330. memcpy(dstPt, utf8, strlen(utf8));
  331. dstPt += strlen(utf8);
  332. last += strlen(ent) + 2;
  333. }
  334. delete[] ent;
  335. }
  336. }
  337. }
  338. strcpy(dstPt, last);
  339. std::string ret(decRet);
  340. delete[] decRet;
  341. return ret;
  342. }
  343. // _____________________________________________________________________________
  344. size_t File::utf8(size_t cp, char* out) {
  345. if (cp <= 0x7F) {
  346. out[0] = cp & 0x7F;
  347. return 1;
  348. } else if (cp <= 0x7FF) {
  349. out[0] = 0xC0 | (cp >> 6);
  350. out[1] = 0x80 | (cp & 0x3F);
  351. return 2;
  352. } else if (cp <= 0xFFFF) {
  353. out[0] = 0xE0 | (cp >> 12);
  354. out[1] = 0x80 | ((cp >> 6) & 0x3F);
  355. out[2] = 0x80 | (cp & 0x3F);
  356. return 3;
  357. } else if (cp <= 0x1FFFFF) {
  358. out[0] = 0xF0 | (cp >> 18);
  359. out[1] = 0x80 | ((cp >> 12) & 0x3F);
  360. out[2] = 0x80 | ((cp >> 6) & 0x3F);
  361. out[3] = 0x80 | (cp & 0x3F);
  362. return 4;
  363. }
  364. return 0;
  365. }