// pfxml.h
  1. // Copyright 2017 Patrick Brosi
  2. // info@patrickbrosi.de
  3. #ifndef PFXML_H_
  4. #define PFXML_H_
#include <fcntl.h>
#include <unistd.h>
#include <cassert>
#include <cctype>
#include <cerrno>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <exception>
#include <fstream>
#include <map>
#include <sstream>
#include <stack>
#include <string>
  11. namespace pfxml {
// Number of payload bytes in each of the two read buffers owned by
// pfxml::file (each buffer is allocated with one extra byte).
static const size_t BUFFER_S = 16 * 1024;

// States of the character-driven state machine run by file::next().
enum state {
  // Between tokens: whitespace is skipped, '<' opens a tag, anything else
  // starts a text node.
  NONE,
  // Reading the name of an opening tag.
  IN_TAG_NAME,
  // Entered after "<?"; everything up to the next '>' is skipped.
  IN_TAG_NAME_META,
  // Inside an opening tag, after its name: expects an attribute key, '/' or
  // '>'.
  IN_TAG,
  // After the name of a closing tag: whitespace is allowed, '>' is expected.
  IN_TAG_CLOSE,
  // Reading the name of a closing tag (after "</").
  IN_TAG_NAME_CLOSE,
  // Directly after '<': the next character decides which kind of tag follows.
  IN_TAG_TENTATIVE,
  // Reading an attribute key.
  IN_ATTRKEY,
  // Whitespace followed an attribute key; '=' is expected.
  AFTER_ATTRKEY,
  // After '=': waiting for the opening quote of the attribute value.
  AW_IN_ATTRVAL,
  // Inside a single-quoted attribute value.
  IN_ATTRVAL_SQ,
  // Inside a double-quoted attribute value.
  IN_ATTRVAL_DQ,
  // Inside a text node; ends at the next '<'.
  IN_TEXT,
  // After "<!": a '-' is required.
  IN_COMMENT_TENTATIVE,
  // After "<!-": a second '-' is required.
  IN_COMMENT_TENTATIVE2,
  // Inside a comment.
  IN_COMMENT,
  // A '-' was seen inside a comment.
  IN_COMMENT_CL_TENTATIVE,
  // "--" was seen inside a comment; '>' closes the comment.
  IN_COMMENT_CL_TENTATIVE2,
  // A '/' was seen inside an opening tag; '>' is awaited.
  AW_CLOSING,
  // A tag has been completed; whitespace is skipped before it is returned.
  WS_SKIP
};
// Named character entities and their replacement text, keyed by the entity
// name without the leading '&' and the trailing ';'. file::decode() looks up
// the text between '&' and ';' in this table. Entries written with \x escapes
// are spelled out as UTF-8 byte sequences.
//
// see
// http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
static const std::map<std::string, const char*> ENTITIES = {
{"aacute", "á"},
{"Aacute", "Á"},
{"acirc", "â"},
{"Acirc", "Â"},
{"acute", "´"},
{"aelig", "æ"},
{"AElig", "Æ"},
{"agrave", "à"},
{"Agrave", "À"},
{"alefsym", "ℵ"},
{"alpha", "α"},
{"Alpha", "Α"},
{"amp", "&"},
{"and", "∧"},
{"ang", "∠"},
{"apos", "'"},
{"aring", "å"},
{"Aring", "Å"},
{"asymp", "≈"},
{"atilde", "ã"},
{"Atilde", "Ã"},
{"auml", "ä"},
{"Auml", "Ä"},
{"bdquo", "„"},
{"beta", "β"},
{"Beta", "Β"},
{"brvbar", "¦"},
{"bull", "•"},
{"cap", "∩"},
{"ccedil", "ç"},
{"Ccedil", "Ç"},
{"cedil", "¸"},
{"cent", "¢"},
{"chi", "χ"},
{"Chi", "Χ"},
{"circ", "ˆ"},
{"clubs", "♣"},
{"cong", "≅"},
{"copy", "©"},
{"crarr", "↵"},
{"cup", "∪"},
{"curren", "¤"},
{"dagger", "†"},
{"Dagger", "‡"},
{"darr", "↓"},
{"dArr", "⇓"},
{"deg", "°"},
{"delta", "δ"},
{"Delta", "Δ"},
{"diams", "♦"},
{"divide", "÷"},
{"eacute", "é"},
{"Eacute", "É"},
{"ecirc", "ê"},
{"Ecirc", "Ê"},
{"egrave", "è"},
{"Egrave", "È"},
{"empty", "∅"},
{"emsp", "\xE2\x80\x83"},
{"ensp", "\xE2\x80\x82"},
{"epsilon", "ε"},
{"Epsilon", "Ε"},
{"equiv", "≡"},
{"eta", "η"},
{"Eta", "Η"},
{"eth", "ð"},
{"ETH", "Ð"},
{"euml", "ë"},
{"Euml", "Ë"},
{"euro", "€"},
{"exist", "∃"},
{"fnof", "ƒ"},
{"forall", "∀"},
{"frac12", "½"},
{"frac14", "¼"},
{"frac34", "¾"},
{"frasl", "⁄"},
{"gamma", "γ"},
{"Gamma", "Γ"},
{"ge", "≥"},
{"gt", ">"},
{"harr", "↔"},
{"hArr", "⇔"},
{"hearts", "♥"},
{"hellip", "…"},
{"iacute", "í"},
{"Iacute", "Í"},
{"icirc", "î"},
{"Icirc", "Î"},
{"iexcl", "¡"},
{"igrave", "ì"},
{"Igrave", "Ì"},
{"image", "ℑ"},
{"infin", "∞"},
{"int", "∫"},
{"iota", "ι"},
{"Iota", "Ι"},
{"iquest", "¿"},
{"isin", "∈"},
{"iuml", "ï"},
{"Iuml", "Ï"},
{"kappa", "κ"},
{"Kappa", "Κ"},
{"lambda", "λ"},
{"Lambda", "Λ"},
{"lang", "〈"},
{"laquo", "«"},
{"larr", "←"},
{"lArr", "⇐"},
{"lceil", "⌈"},
{"ldquo", "“"},
{"le", "≤"},
{"lfloor", "⌊"},
{"lowast", "∗"},
{"loz", "◊"},
{"lrm", "\xE2\x80\x8E"},
{"lsaquo", "‹"},
{"lsquo", "‘"},
{"lt", "<"},
{"macr", "¯"},
{"mdash", "—"},
{"micro", "µ"},
{"middot", "·"},
{"minus", "−"},
{"mu", "μ"},
{"Mu", "Μ"},
{"nabla", "∇"},
{"nbsp", "\xC2\xA0"},
{"ndash", "–"},
{"ne", "≠"},
{"ni", "∋"},
{"not", "¬"},
{"notin", "∉"},
{"nsub", "⊄"},
{"ntilde", "ñ"},
{"Ntilde", "Ñ"},
{"nu", "ν"},
{"Nu", "Ν"},
{"oacute", "ó"},
{"Oacute", "Ó"},
{"ocirc", "ô"},
{"Ocirc", "Ô"},
{"oelig", "œ"},
{"OElig", "Œ"},
{"ograve", "ò"},
{"Ograve", "Ò"},
{"oline", "‾"},
{"omega", "ω"},
{"Omega", "Ω"},
{"omicron", "ο"},
{"Omicron", "Ο"},
{"oplus", "⊕"},
{"or", "∨"},
{"ordf", "ª"},
{"ordm", "º"},
{"oslash", "ø"},
{"Oslash", "Ø"},
{"otilde", "õ"},
{"Otilde", "Õ"},
{"otimes", "⊗"},
{"ouml", "ö"},
{"Ouml", "Ö"},
{"para", "¶"},
{"part", "∂"},
{"permil", "‰"},
{"perp", "⊥"},
{"phi", "φ"},
{"Phi", "Φ"},
{"piv", "ϖ"},
{"pi", "π"},
{"Pi", "Π"},
{"plusmn", "±"},
{"pound", "£"},
{"prime", "′"},
{"Prime", "″"},
{"prod", "∏"},
{"prop", "∝"},
{"psi", "ψ"},
{"Psi", "Ψ"},
{"quot", "\""},
{"radic", "√"},
{"rang", "〉"},
{"raquo", "»"},
{"rarr", "→"},
{"rArr", "⇒"},
{"rceil", "⌉"},
{"rdquo", "”"},
{"real", "ℜ"},
{"reg", "®"},
{"rfloor", "⌋"},
{"rho", "ρ"},
{"Rho", "Ρ"},
{"rlm", "\xE2\x80\x8F"},
{"rsaquo", "›"},
{"rsquo", "’"},
{"sbquo", "‚"},
{"scaron", "š"},
{"Scaron", "Š"},
{"sdot", "⋅"},
{"sect", "§"},
{"shy", "\xC2\xAD"},
{"sigmaf", "ς"},
{"sigma", "σ"},
{"Sigma", "Σ"},
{"sim", "∼"},
{"spades", "♠"},
{"sub", "⊂"},
{"sube", "⊆"},
{"sum", "∑"},
{"sup", "⊃"},
{"sup1", "¹"},
{"sup2", "²"},
{"sup3", "³"},
{"supe", "⊇"},
{"szlig", "ß"},
{"tau", "τ"},
{"Tau", "Τ"},
{"there4", "∴"},
{"thetasym", "ϑ"},
{"theta", "θ"},
{"Theta", "Θ"},
{"thinsp", "\xE2\x80\x89"},
{"thorn", "þ"},
{"THORN", "Þ"},
{"tilde", "˜"},
{"times", "×"},
{"trade", "™"},
{"uacute", "ú"},
{"Uacute", "Ú"},
{"uarr", "↑"},
{"uArr", "⇑"},
{"ucirc", "û"},
{"Ucirc", "Û"},
{"ugrave", "ù"},
{"Ugrave", "Ù"},
{"uml", "¨"},
{"upsih", "ϒ"},
{"upsilon", "υ"},
{"Upsilon", "Υ"},
{"uuml", "ü"},
{"Uuml", "Ü"},
{"weierp", "℘"},
{"xi", "ξ"},
{"Xi", "Ξ"},
{"yacute", "ý"},
{"Yacute", "Ý"},
{"yen", "¥"},
{"yuml", "ÿ"},
{"Yuml", "Ÿ"},
{"zeta", "ζ"},
{"Zeta", "Ζ"},
{"zwj", "\xE2\x80\x8D"},
{"zwnj", "\xE2\x80\x8C"}};
  291. class parse_exc : public std::exception {
  292. public:
  293. parse_exc(std::string msg, std::string file, const char* p, char* buff,
  294. size_t offset) {
  295. std::stringstream ss;
  296. ss << file << " at position " << (offset + (p - buff)) << ": " << msg;
  297. _msg = ss.str();
  298. }
  299. ~parse_exc() throw() {}
  300. virtual const char* what() const throw() { return _msg.c_str(); }
  301. private:
  302. std::string _msg;
  303. };
  304. struct attr_cmp {
  305. bool operator()(const char* const& a, const char* const& b) const {
  306. return std::strcmp(a, b) < 0;
  307. }
  308. };
  309. struct parser_state {
  310. parser_state() : s(NONE), hanging(0), off(0) {}
  311. std::stack<std::string> tagStack;
  312. state s;
  313. size_t hanging;
  314. int64_t off;
  315. };
  316. typedef std::map<const char*, const char*, attr_cmp> AttrMap;
  317. struct tag {
  318. const char* name;
  319. const char* text;
  320. AttrMap attrs;
  321. };
  322. class file {
  323. public:
  324. file(const std::string& path);
  325. ~file();
  326. const tag& get() const;
  327. bool next();
  328. size_t level() const;
  329. void reset();
  330. parser_state state();
  331. void set_state(const parser_state& s);
  332. static std::string decode(const char* str);
  333. static std::string decode(const std::string& str);
  334. private:
  335. int _file;
  336. parser_state _s;
  337. parser_state _prevs;
  338. char** _buffer;
  339. char* _c;
  340. int64_t _lastBytes;
  341. const char* _tmp;
  342. const char* _tmp2;
  343. size_t _which;
  344. std::string _path;
  345. int64_t _totReadBef;
  346. int64_t _lastNewData;
  347. tag _ret;
  348. static size_t utf8(size_t cp, char* out);
  349. const char* emptyStr = "";
  350. };
  351. // _____________________________________________________________________________
  352. inline file::file(const std::string& path)
  353. : _file(0), _c(0), _lastBytes(0), _which(0), _path(path), _totReadBef(0) {
  354. _buffer = new char*[2];
  355. _buffer[0] = new char[BUFFER_S + 1];
  356. _buffer[1] = new char[BUFFER_S + 1];
  357. reset();
  358. }
  359. // _____________________________________________________________________________
  360. inline file::~file() {
  361. delete[] _buffer[0];
  362. delete[] _buffer[1];
  363. delete[] _buffer;
  364. close(_file);
  365. }
  366. // _____________________________________________________________________________
  367. inline void file::reset() {
  368. _which = 0;
  369. _s.s = NONE;
  370. _s.hanging = 0;
  371. _totReadBef = 0;
  372. if (_file) close(_file);
  373. _file = open(_path.c_str(), O_RDONLY);
  374. if (_file < 0)
  375. throw parse_exc(std::string("could not open file"), _path, 0, 0, 0);
  376. #ifdef __unix__
  377. posix_fadvise(_file, 0, 0, POSIX_FADV_SEQUENTIAL);
  378. #endif
  379. _lastBytes = read(_file, _buffer[_which], BUFFER_S);
  380. _lastNewData = _lastBytes;
  381. _c = _buffer[_which];
  382. while (!_s.tagStack.empty()) _s.tagStack.pop();
  383. _s.tagStack.push("[root]");
  384. _prevs = _s;
  385. }
  386. // _____________________________________________________________________________
  387. inline size_t file::level() const { return _s.tagStack.size() - _s.hanging; }
  388. // _____________________________________________________________________________
  389. inline parser_state file::state() { return _prevs; }
  390. // _____________________________________________________________________________
  391. inline void file::set_state(const parser_state& s) {
  392. _s = s;
  393. _prevs = s;
  394. lseek(_file, _s.off, SEEK_SET);
  395. _totReadBef = _s.off;
  396. _lastBytes = read(_file, _buffer[_which], BUFFER_S);
  397. _lastNewData = _lastBytes;
  398. _c = _buffer[_which];
  399. next();
  400. }
  401. // _____________________________________________________________________________
  402. inline const tag& file::get() const { return _ret; }
  403. // _____________________________________________________________________________
  404. inline bool file::next() {
  405. if (!_s.tagStack.size()) return false;
  406. // avoid too much stack copying
  407. if (_prevs.tagStack.size() != _s.tagStack.size() ||
  408. _prevs.tagStack.top() != _s.tagStack.top()) {
  409. _prevs.tagStack = _s.tagStack;
  410. }
  411. _prevs.s = _s.s;
  412. _prevs.hanging = _s.hanging;
  413. _prevs.off =
  414. _totReadBef + (_c - _buffer[_which]) - (_lastBytes - _lastNewData);
  415. if (_s.hanging) _s.hanging--;
  416. _ret.name = 0;
  417. _ret.text = emptyStr;
  418. _ret.attrs.clear();
  419. void* i;
  420. while (_lastBytes) {
  421. for (; _c - _buffer[_which] < _lastBytes; ++_c) {
  422. char c = *_c;
  423. switch (_s.s) {
  424. case NONE:
  425. if (std::isspace(c))
  426. continue;
  427. else if (c == '<') {
  428. _s.s = IN_TAG_TENTATIVE;
  429. continue;
  430. }
  431. _s.s = IN_TEXT;
  432. _ret.name = emptyStr;
  433. _tmp = _c;
  434. continue;
  435. case IN_TEXT:
  436. i = memchr(_c, '<', _lastBytes - (_c - _buffer[_which]));
  437. if (!i) {
  438. _c = _buffer[_which] + _lastBytes;
  439. continue;
  440. }
  441. _c = (char*)i;
  442. *_c = 0;
  443. _ret.text = _tmp;
  444. _s.s = IN_TAG_TENTATIVE;
  445. _c++;
  446. return true;
  447. case IN_COMMENT_TENTATIVE:
  448. if (c == '-') {
  449. _s.s = IN_COMMENT_TENTATIVE2;
  450. continue;
  451. }
  452. throw parse_exc("Expected comment", _path, _c, _buffer[_which],
  453. _prevs.off);
  454. case IN_COMMENT_TENTATIVE2:
  455. if (c == '-') {
  456. _s.s = IN_COMMENT;
  457. continue;
  458. }
  459. throw parse_exc("Expected comment", _path, _c, _buffer[_which],
  460. _prevs.off);
  461. case IN_COMMENT_CL_TENTATIVE:
  462. if (c == '-') {
  463. _s.s = IN_COMMENT_CL_TENTATIVE2;
  464. continue;
  465. }
  466. _s.s = IN_COMMENT;
  467. continue;
  468. case IN_COMMENT_CL_TENTATIVE2:
  469. if (c == '>') {
  470. _s.s = NONE;
  471. continue;
  472. }
  473. _s.s = IN_COMMENT;
  474. // fall through, we are still in comment
  475. case IN_COMMENT:
  476. i = memchr(_c, '-', _lastBytes - (_c - _buffer[_which]));
  477. if (!i) {
  478. _c = _buffer[_which] + _lastBytes;
  479. continue;
  480. }
  481. _c = (char*)i;
  482. _s.s = IN_COMMENT_CL_TENTATIVE;
  483. continue;
  484. case IN_TAG_TENTATIVE:
  485. if (c == '/') {
  486. _s.s = IN_TAG_NAME_CLOSE;
  487. _tmp = _c + 1;
  488. continue;
  489. } else if (c == '?') {
  490. _s.s = IN_TAG_NAME_META;
  491. continue;
  492. } else if (c == '!') {
  493. _s.s = IN_COMMENT_TENTATIVE;
  494. continue;
  495. } else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
  496. _s.s = IN_TAG_NAME;
  497. _ret.name = _c;
  498. continue;
  499. }
  500. case IN_TAG:
  501. if (std::isspace(c))
  502. continue;
  503. else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
  504. _s.s = IN_ATTRKEY;
  505. _tmp = _c;
  506. continue;
  507. } else if (c == '/') {
  508. _s.s = AW_CLOSING;
  509. continue;
  510. } else if (c == '>') {
  511. _s.hanging++;
  512. _s.tagStack.push(_ret.name);
  513. _s.s = WS_SKIP;
  514. continue;
  515. }
  516. throw parse_exc("Expected valid tag", _path, _c, _buffer[_which],
  517. _prevs.off);
  518. case IN_ATTRVAL_SQ:
  519. i = memchr(_c, '\'', _lastBytes - (_c - _buffer[_which]));
  520. if (!i) {
  521. _c = _buffer[_which] + _lastBytes;
  522. continue;
  523. }
  524. _c = (char*)i;
  525. _s.s = IN_TAG;
  526. *_c = 0;
  527. _ret.attrs[_tmp] = _tmp2;
  528. continue;
  529. case IN_ATTRVAL_DQ:
  530. i = memchr(_c, '"', _lastBytes - (_c - _buffer[_which]));
  531. if (!i) {
  532. _c = _buffer[_which] + _lastBytes;
  533. continue;
  534. }
  535. _c = (char*)i;
  536. _s.s = IN_TAG;
  537. *_c = 0;
  538. _ret.attrs[_tmp] = _tmp2;
  539. continue;
  540. case AW_IN_ATTRVAL:
  541. if (std::isspace(c))
  542. continue;
  543. else if (c == '\'') {
  544. _s.s = IN_ATTRVAL_SQ;
  545. _tmp2 = _c + 1;
  546. continue;
  547. } else if (c == '"') {
  548. _s.s = IN_ATTRVAL_DQ;
  549. _tmp2 = _c + 1;
  550. continue;
  551. }
  552. throw parse_exc("Expected attribute value", _path, _c,
  553. _buffer[_which], _prevs.off);
  554. case IN_ATTRKEY:
  555. if (std::isspace(c)) {
  556. *_c = 0;
  557. _s.s = AFTER_ATTRKEY;
  558. continue;
  559. } else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
  560. continue;
  561. } else if (c == '=') {
  562. *_c = 0;
  563. _s.s = AW_IN_ATTRVAL;
  564. continue;
  565. }
  566. throw parse_exc("Expected attribute key char or =", _path, _c,
  567. _buffer[_which], _prevs.off);
  568. case AFTER_ATTRKEY:
  569. if (std::isspace(c))
  570. continue;
  571. else if (c == '=') {
  572. _s.s = AW_IN_ATTRVAL;
  573. continue;
  574. }
  575. throw parse_exc(
  576. std::string("Expected attribute value for '") + _tmp + "'.",
  577. _path, _c, _buffer[_which], _prevs.off);
  578. case IN_TAG_NAME:
  579. if (std::isspace(c)) {
  580. *_c = 0;
  581. _s.s = IN_TAG;
  582. continue;
  583. } else if (c == '>') {
  584. *_c = 0;
  585. _s.hanging++;
  586. _s.tagStack.push(_ret.name);
  587. _s.s = WS_SKIP;
  588. continue;
  589. } else if (c == '/') {
  590. *_c = 0;
  591. _s.s = AW_CLOSING;
  592. continue;
  593. } else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
  594. continue;
  595. }
  596. case IN_TAG_NAME_META:
  597. // TODO: read meta tags!
  598. if (c == '>') {
  599. _s.s = NONE;
  600. continue;
  601. }
  602. continue;
  603. case IN_TAG_NAME_CLOSE:
  604. if (std::isspace(c)) {
  605. *_c = 0;
  606. _s.s = IN_TAG_CLOSE;
  607. continue;
  608. } else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
  609. continue;
  610. } else if (c == '>') {
  611. *_c = 0;
  612. if (_tmp != _s.tagStack.top()) {
  613. throw parse_exc(std::string("Closing wrong tag '<") + _tmp +
  614. ">', expected close of '<" +
  615. _s.tagStack.top() + ">'.",
  616. _path, _c, _buffer[_which], _prevs.off);
  617. }
  618. _s.tagStack.pop();
  619. _s.s = NONE;
  620. continue;
  621. }
  622. case IN_TAG_CLOSE:
  623. if (std::isspace(c))
  624. continue;
  625. else if (c == '>') {
  626. if (_tmp != _s.tagStack.top()) {
  627. throw parse_exc(std::string("Closing wrong tag '<") + _tmp +
  628. ">', expected close of '<" +
  629. _s.tagStack.top() + ">'.",
  630. _path, _c, _buffer[_which], _prevs.off);
  631. }
  632. _s.tagStack.pop();
  633. _s.s = NONE;
  634. continue;
  635. }
  636. throw parse_exc("Expected '>'", _path, _c, _buffer[_which],
  637. _prevs.off);
  638. case AW_CLOSING:
  639. if (c == '>') {
  640. _s.s = WS_SKIP;
  641. continue;
  642. }
  643. case WS_SKIP:
  644. if (std::isspace(c)) continue;
  645. _s.s = NONE;
  646. return true;
  647. }
  648. }
  649. // buffer ended, read new stuff, but copy remaining if needed
  650. size_t off = 0;
  651. if (_s.s == IN_TAG_NAME) { //|| IN_TAG_NAME_META) {
  652. off = _lastBytes - (_ret.name - _buffer[_which]);
  653. memmove(_buffer[!_which], _ret.name, off);
  654. _ret.name = _buffer[!_which];
  655. } else if (_s.s == IN_TAG_NAME_CLOSE || _s.s == IN_ATTRKEY ||
  656. _s.s == IN_TEXT) {
  657. off = _lastBytes - (_tmp - _buffer[_which]);
  658. memmove(_buffer[!_which], _tmp, off);
  659. _tmp = _buffer[!_which];
  660. } else if (_s.s == IN_ATTRVAL_SQ || _s.s == IN_ATTRVAL_DQ) {
  661. off = _lastBytes - (_tmp2 - _buffer[_which]);
  662. memmove(_buffer[!_which], _tmp2, off);
  663. _tmp2 = _buffer[!_which];
  664. }
  665. assert(off <= BUFFER_S);
  666. size_t readb = read(_file, _buffer[!_which] + off, BUFFER_S - off);
  667. if (!readb) break;
  668. _totReadBef += _lastNewData;
  669. _which = !_which;
  670. _lastNewData = readb;
  671. _lastBytes = _lastNewData + off;
  672. _c = _buffer[_which] + off;
  673. }
  674. if (_s.tagStack.size()) {
  675. if (_s.tagStack.top() != "[root]") {
  676. throw parse_exc("XML tree not complete", _path, _c, _buffer[_which],
  677. _prevs.off);
  678. }
  679. _s.tagStack.pop();
  680. }
  681. _s.s = NONE;
  682. _ret.name = "[root]";
  683. return false;
  684. }
  685. // _____________________________________________________________________________
  686. inline std::string file::decode(const std::string& str) {
  687. return decode(str.c_str());
  688. }
  689. // _____________________________________________________________________________
  690. inline std::string file::decode(const char* str) {
  691. const char* c = strchr(str, '&');
  692. if (!c) return str;
  693. char* decRet = new char[strlen(str) + 1];
  694. const char* last = str;
  695. char* dstPt = decRet;
  696. for (; c != 0; c = strchr(c + 1, '&')) {
  697. memcpy(dstPt, last, c - last);
  698. dstPt += c - last;
  699. last = c;
  700. if (*(c + 1) == '#') {
  701. uint64_t cp = -1;
  702. char* tail;
  703. errno = 0;
  704. if (*(c + 2) == 'x' || *(c + 2) == 'X')
  705. cp = strtoul(c + 3, &tail, 16);
  706. else
  707. cp = strtoul(c + 2, &tail, 10);
  708. if (*tail == ';' && cp <= 0x1FFFFF && !errno) {
  709. dstPt += utf8(cp, dstPt);
  710. last = tail + 1;
  711. }
  712. } else {
  713. const char* e = strchr(c, ';');
  714. if (e) {
  715. char* ent = new char[e - 1 - c + 1];
  716. memcpy(ent, c + 1, e - 1 - c);
  717. ent[e - 1 - c] = 0;
  718. const auto it = ENTITIES.find(ent);
  719. if (it != ENTITIES.end()) {
  720. const char* utf8 = it->second;
  721. memcpy(dstPt, utf8, strlen(utf8));
  722. dstPt += strlen(utf8);
  723. last += strlen(ent) + 2;
  724. }
  725. delete[] ent;
  726. }
  727. }
  728. }
  729. strcpy(dstPt, last);
  730. std::string ret(decRet);
  731. delete[] decRet;
  732. return ret;
  733. }
  734. // _____________________________________________________________________________
  735. inline size_t file::utf8(size_t cp, char* out) {
  736. if (cp <= 0x7F) {
  737. out[0] = cp & 0x7F;
  738. return 1;
  739. } else if (cp <= 0x7FF) {
  740. out[0] = 0xC0 | (cp >> 6);
  741. out[1] = 0x80 | (cp & 0x3F);
  742. return 2;
  743. } else if (cp <= 0xFFFF) {
  744. out[0] = 0xE0 | (cp >> 12);
  745. out[1] = 0x80 | ((cp >> 6) & 0x3F);
  746. out[2] = 0x80 | (cp & 0x3F);
  747. return 3;
  748. } else if (cp <= 0x1FFFFF) {
  749. out[0] = 0xF0 | (cp >> 18);
  750. out[1] = 0x80 | ((cp >> 12) & 0x3F);
  751. out[2] = 0x80 | ((cp >> 6) & 0x3F);
  752. out[3] = 0x80 | (cp & 0x3F);
  753. return 4;
  754. }
  755. return 0;
  756. }
  757. }
  758. #endif // PFXML_H_