File.cpp 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479
  1. // Copyright 2017 Patrick Brosi
  2. // info@patrickbrosi.de
  3. #include <fcntl.h>
  4. #include <sys/stat.h>
  5. #include <sys/types.h>
  6. #include <unistd.h>
  7. #include <cassert>
  8. #include <cstring>
  9. #include <fstream>
  10. #include <iostream>
  11. #include <map>
  12. #include "xml/File.h"
  13. #include "xml/NamedEnts.h"
  14. using namespace xml;
  15. // _____________________________________________________________________________
  16. File::File(const std::string& path)
  17. : _file(0), _c(0), _lastBytes(0), _which(0), _path(path), _totReadBef(0) {
  18. _buffer = new char*[2];
  19. _buffer[0] = new char[BUFFER_S + 1];
  20. _buffer[1] = new char[BUFFER_S + 1];
  21. reset();
  22. }
  23. // _____________________________________________________________________________
  24. File::~File() {
  25. delete[] _buffer[0];
  26. delete[] _buffer[1];
  27. delete[] _buffer;
  28. close(_file);
  29. }
  30. // _____________________________________________________________________________
  31. void File::reset() {
  32. _which = 0;
  33. _s.s = NONE;
  34. _s.hanging = 0;
  35. _totReadBef = 0;
  36. if (_file) close(_file);
  37. _file = open(_path.c_str(), O_RDONLY);
  38. if (_file < 0)
  39. throw XmlFileException(std::string("could not open file"), _path, 0, 0, 0);
  40. #ifdef __unix__
  41. posix_fadvise(_file, 0, 0, POSIX_FADV_SEQUENTIAL);
  42. #endif
  43. _lastBytes = read(_file, _buffer[_which], BUFFER_S);
  44. _lastNewData = _lastBytes;
  45. _c = _buffer[_which];
  46. while (!_s.tagStack.empty()) _s.tagStack.pop();
  47. _s.tagStack.push("[root]");
  48. _prevs = _s;
  49. }
  50. // _____________________________________________________________________________
  51. size_t File::level() const { return _s.tagStack.size() - _s.hanging; }
  52. // _____________________________________________________________________________
  53. ParserState File::state() { return _prevs; }
  54. // _____________________________________________________________________________
  55. void File::setState(const ParserState& s) {
  56. _s = s;
  57. _prevs = s;
  58. lseek(_file, _s.off, SEEK_SET);
  59. _totReadBef = _s.off;
  60. _lastBytes = read(_file, _buffer[_which], BUFFER_S);
  61. _lastNewData = _lastBytes;
  62. _c = _buffer[_which];
  63. next();
  64. }
  65. // _____________________________________________________________________________
  66. const Tag& File::get() const { return _ret; }
  67. // _____________________________________________________________________________
  68. bool File::next() {
  69. if (!_s.tagStack.size()) return false;
  70. // avoid too much stack copying
  71. if (_prevs.tagStack.size() != _s.tagStack.size() ||
  72. _prevs.tagStack.top() != _s.tagStack.top()) {
  73. _prevs.tagStack = _s.tagStack;
  74. }
  75. _prevs.s = _s.s;
  76. _prevs.hanging = _s.hanging;
  77. _prevs.off =
  78. _totReadBef + (_c - _buffer[_which]) - (_lastBytes - _lastNewData);
  79. if (_s.hanging) _s.hanging--;
  80. _ret.name = 0;
  81. _ret.text = emptyStr;
  82. _ret.attrs.clear();
  83. void* i;
  84. while (_lastBytes) {
  85. for (; _c - _buffer[_which] < _lastBytes; ++_c) {
  86. char c = *_c;
  87. switch (_s.s) {
  88. case NONE:
  89. if (std::isspace(c))
  90. continue;
  91. else if (c == '<') {
  92. _s.s = IN_TAG_TENTATIVE;
  93. continue;
  94. } else {
  95. _s.s = IN_TEXT;
  96. _ret.name = emptyStr;
  97. _tmp = _c;
  98. continue;
  99. }
  100. case IN_TEXT:
  101. i = memchr(_c, '<', _lastBytes - (_c - _buffer[_which]));
  102. if (!i) {
  103. _c = _buffer[_which] + _lastBytes;
  104. continue;
  105. } else {
  106. _c = (char*)i;
  107. *_c = 0;
  108. _ret.text = _tmp;
  109. _s.s = IN_TAG_TENTATIVE;
  110. _c++;
  111. return true;
  112. }
  113. case IN_COMMENT_TENTATIVE:
  114. if (c == '-') {
  115. _s.s = IN_COMMENT_TENTATIVE2;
  116. continue;
  117. }
  118. throw XmlFileException("Expected comment", _path, _c, _buffer[_which],
  119. _prevs.off);
  120. case IN_COMMENT_TENTATIVE2:
  121. if (c == '-') {
  122. _s.s = IN_COMMENT;
  123. continue;
  124. }
  125. throw XmlFileException("Expected comment", _path, _c, _buffer[_which],
  126. _prevs.off);
  127. case IN_COMMENT_CL_TENTATIVE:
  128. if (c == '-') {
  129. _s.s = IN_COMMENT_CL_TENTATIVE2;
  130. continue;
  131. }
  132. _s.s = IN_COMMENT;
  133. continue;
  134. case IN_COMMENT_CL_TENTATIVE2:
  135. if (c == '>') {
  136. _s.s = NONE;
  137. continue;
  138. }
  139. _s.s = IN_COMMENT;
  140. // fall through, we are still in comment
  141. case IN_COMMENT:
  142. i = memchr(_c, '-', _lastBytes - (_c - _buffer[_which]));
  143. if (!i) {
  144. _c = _buffer[_which] + _lastBytes;
  145. continue;
  146. } else {
  147. _c = (char*)i;
  148. _s.s = IN_COMMENT_CL_TENTATIVE;
  149. continue;
  150. }
  151. case IN_TAG_TENTATIVE:
  152. if (c == '/') {
  153. _s.s = IN_TAG_NAME_CLOSE;
  154. _tmp = _c + 1;
  155. continue;
  156. } else if (c == '?') {
  157. _s.s = IN_TAG_NAME_META;
  158. continue;
  159. } else if (c == '!') {
  160. _s.s = IN_COMMENT_TENTATIVE;
  161. continue;
  162. } else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
  163. _s.s = IN_TAG_NAME;
  164. _ret.name = _c;
  165. continue;
  166. }
  167. case IN_TAG:
  168. if (std::isspace(c))
  169. continue;
  170. else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
  171. _s.s = IN_ATTRKEY;
  172. _tmp = _c;
  173. continue;
  174. } else if (c == '/') {
  175. _s.s = AW_CLOSING;
  176. continue;
  177. } else if (c == '>') {
  178. _s.hanging++;
  179. _s.tagStack.push(_ret.name);
  180. _s.s = WS_SKIP;
  181. continue;
  182. } else {
  183. throw XmlFileException("Expected valid tag", _path, _c,
  184. _buffer[_which], _prevs.off);
  185. }
  186. case IN_ATTRVAL_SQ:
  187. i = memchr(_c, '\'', _lastBytes - (_c - _buffer[_which]));
  188. if (!i) {
  189. _c = _buffer[_which] + _lastBytes;
  190. continue;
  191. } else {
  192. _c = (char*)i;
  193. _s.s = IN_TAG;
  194. *_c = 0;
  195. _ret.attrs[_tmp] = _tmp2;
  196. continue;
  197. }
  198. case IN_ATTRVAL_DQ:
  199. i = memchr(_c, '"', _lastBytes - (_c - _buffer[_which]));
  200. if (!i) {
  201. _c = _buffer[_which] + _lastBytes;
  202. continue;
  203. } else {
  204. _c = (char*)i;
  205. _s.s = IN_TAG;
  206. *_c = 0;
  207. _ret.attrs[_tmp] = _tmp2;
  208. continue;
  209. }
  210. case AW_IN_ATTRVAL:
  211. if (std::isspace(c))
  212. continue;
  213. else if (c == '\'') {
  214. _s.s = IN_ATTRVAL_SQ;
  215. _tmp2 = _c + 1;
  216. continue;
  217. } else if (c == '"') {
  218. _s.s = IN_ATTRVAL_DQ;
  219. _tmp2 = _c + 1;
  220. continue;
  221. } else {
  222. throw XmlFileException("Expected attribute value", _path, _c,
  223. _buffer[_which], _prevs.off);
  224. }
  225. case IN_ATTRKEY:
  226. if (std::isspace(c)) {
  227. *_c = 0;
  228. _s.s = AFTER_ATTRKEY;
  229. continue;
  230. } else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
  231. continue;
  232. } else if (c == '=') {
  233. *_c = 0;
  234. _s.s = AW_IN_ATTRVAL;
  235. continue;
  236. }
  237. throw XmlFileException("Expected attribute key char or =", _path, _c,
  238. _buffer[_which], _prevs.off);
  239. case AFTER_ATTRKEY:
  240. if (std::isspace(c))
  241. continue;
  242. else if (c == '=') {
  243. _s.s = AW_IN_ATTRVAL;
  244. continue;
  245. } else {
  246. throw XmlFileException(
  247. std::string("Expected attribute value for '") + _tmp + "'.",
  248. _path, _c, _buffer[_which], _prevs.off);
  249. }
  250. case IN_TAG_NAME:
  251. if (std::isspace(c)) {
  252. *_c = 0;
  253. _s.s = IN_TAG;
  254. continue;
  255. } else if (c == '>') {
  256. *_c = 0;
  257. _s.hanging++;
  258. _s.tagStack.push(_ret.name);
  259. _s.s = WS_SKIP;
  260. continue;
  261. } else if (c == '/') {
  262. *_c = 0;
  263. _s.s = AW_CLOSING;
  264. continue;
  265. } else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
  266. continue;
  267. }
  268. case IN_TAG_NAME_META:
  269. // TODO: read meta tags!
  270. if (c == '>') {
  271. _s.s = NONE;
  272. continue;
  273. }
  274. continue;
  275. case IN_TAG_NAME_CLOSE:
  276. if (std::isspace(c)) {
  277. *_c = 0;
  278. _s.s = IN_TAG_CLOSE;
  279. continue;
  280. } else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
  281. continue;
  282. } else if (c == '>') {
  283. *_c = 0;
  284. if (_tmp != _s.tagStack.top()) {
  285. throw XmlFileException(std::string("Closing wrong tag '<") + _tmp +
  286. ">', expected close of '<" +
  287. _s.tagStack.top() + ">'.",
  288. _path, _c, _buffer[_which], _prevs.off);
  289. }
  290. _s.tagStack.pop();
  291. _s.s = NONE;
  292. continue;
  293. }
  294. case IN_TAG_CLOSE:
  295. if (std::isspace(c))
  296. continue;
  297. else if (c == '>') {
  298. if (_tmp != _s.tagStack.top()) {
  299. throw XmlFileException(std::string("Closing wrong tag '<") + _tmp +
  300. ">', expected close of '<" +
  301. _s.tagStack.top() + ">'.",
  302. _path, _c, _buffer[_which], _prevs.off);
  303. }
  304. _s.tagStack.pop();
  305. _s.s = NONE;
  306. continue;
  307. } else {
  308. throw XmlFileException("Expected '>'", _path, _c, _buffer[_which],
  309. _prevs.off);
  310. }
  311. case AW_CLOSING:
  312. if (c == '>') {
  313. _s.s = WS_SKIP;
  314. continue;
  315. }
  316. case WS_SKIP:
  317. if (std::isspace(c))
  318. continue;
  319. else {
  320. _s.s = NONE;
  321. return true;
  322. }
  323. }
  324. }
  325. // buffer ended, read new stuff, but copy remaining if needed
  326. size_t off = 0;
  327. if (_s.s == IN_TAG_NAME) { //|| IN_TAG_NAME_META) {
  328. off = _lastBytes - (_ret.name - _buffer[_which]);
  329. memmove(_buffer[!_which], _ret.name, off);
  330. _ret.name = _buffer[!_which];
  331. } else if (_s.s == IN_TAG_NAME_CLOSE || _s.s == IN_ATTRKEY ||
  332. _s.s == IN_TEXT) {
  333. off = _lastBytes - (_tmp - _buffer[_which]);
  334. memmove(_buffer[!_which], _tmp, off);
  335. _tmp = _buffer[!_which];
  336. } else if (_s.s == IN_ATTRVAL_SQ || _s.s == IN_ATTRVAL_DQ) {
  337. off = _lastBytes - (_tmp2 - _buffer[_which]);
  338. memmove(_buffer[!_which], _tmp2, off);
  339. _tmp2 = _buffer[!_which];
  340. }
  341. assert(off <= BUFFER_S);
  342. size_t readb = read(_file, _buffer[!_which] + off, BUFFER_S - off);
  343. if (!readb) break;
  344. _totReadBef += _lastNewData;
  345. _which = !_which;
  346. _lastNewData = readb;
  347. _lastBytes = _lastNewData + off;
  348. _c = _buffer[_which] + off;
  349. }
  350. if (_s.tagStack.size()) {
  351. if (_s.tagStack.top() != "[root]") {
  352. throw XmlFileException("XML tree not complete", _path, _c,
  353. _buffer[_which], _prevs.off);
  354. } else {
  355. _s.tagStack.pop();
  356. }
  357. }
  358. _s.s = NONE;
  359. _ret.name = "[root]";
  360. return false;
  361. }
  362. // _____________________________________________________________________________
  363. std::string File::decode(const std::string& str) { return decode(str.c_str()); }
  364. // _____________________________________________________________________________
  365. std::string File::decode(const char* str) {
  366. const char* c = strchr(str, '&');
  367. if (!c) return str;
  368. char* decRet = new char[strlen(str) + 1];
  369. const char* last = str;
  370. char* dstPt = decRet;
  371. for (; c != 0; c = strchr(c + 1, '&')) {
  372. memcpy(dstPt, last, c - last);
  373. dstPt += c - last;
  374. last = c;
  375. if (*(c + 1) == '#') {
  376. uint64_t cp = -1;
  377. char* tail;
  378. errno = 0;
  379. if (*(c + 2) == 'x' || *(c + 2) == 'X')
  380. cp = strtoul(c + 3, &tail, 16);
  381. else
  382. cp = strtoul(c + 2, &tail, 10);
  383. if (*tail == ';' && cp <= 0x1FFFFF && !errno) {
  384. dstPt += utf8(cp, dstPt);
  385. last = tail + 1;
  386. }
  387. } else {
  388. const char* e = strchr(c, ';');
  389. if (e) {
  390. char* ent = new char[e - 1 - c + 1];
  391. memcpy(ent, c + 1, e - 1 - c);
  392. ent[e - 1 - c] = 0;
  393. const auto it = xml::ENTITIES.find(ent);
  394. if (it != xml::ENTITIES.end()) {
  395. const char* utf8 = it->second;
  396. memcpy(dstPt, utf8, strlen(utf8));
  397. dstPt += strlen(utf8);
  398. last += strlen(ent) + 2;
  399. }
  400. delete[] ent;
  401. }
  402. }
  403. }
  404. strcpy(dstPt, last);
  405. std::string ret(decRet);
  406. delete[] decRet;
  407. return ret;
  408. }
  409. // _____________________________________________________________________________
  410. size_t File::utf8(size_t cp, char* out) {
  411. if (cp <= 0x7F) {
  412. out[0] = cp & 0x7F;
  413. return 1;
  414. } else if (cp <= 0x7FF) {
  415. out[0] = 0xC0 | (cp >> 6);
  416. out[1] = 0x80 | (cp & 0x3F);
  417. return 2;
  418. } else if (cp <= 0xFFFF) {
  419. out[0] = 0xE0 | (cp >> 12);
  420. out[1] = 0x80 | ((cp >> 6) & 0x3F);
  421. out[2] = 0x80 | (cp & 0x3F);
  422. return 3;
  423. } else if (cp <= 0x1FFFFF) {
  424. out[0] = 0xF0 | (cp >> 18);
  425. out[1] = 0x80 | ((cp >> 12) & 0x3F);
  426. out[2] = 0x80 | ((cp >> 6) & 0x3F);
  427. out[3] = 0x80 | (cp & 0x3F);
  428. return 4;
  429. }
  430. return 0;
  431. }