File.cpp 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408
  1. // Copyright 2017 Patrick Brosi
  2. // info@patrickbrosi.de
#include <fcntl.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <cassert>
#include <cctype>
#include <cerrno>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <map>
#include <string>
#include "xml/File.h"
#include "xml/NamedEnts.h"
  14. using namespace xml;
  15. // _____________________________________________________________________________
  16. File::File(const std::string& path)
  17. : _file(0), _c(0), _lastBytes(0), _which(0), _path(path), _totReadBef(0) {
  18. _buffer = new char*[2];
  19. _buffer[0] = new char[BUFFER_S + 1];
  20. _buffer[1] = new char[BUFFER_S + 1];
  21. reset();
  22. }
  23. // _____________________________________________________________________________
  24. File::~File() {
  25. delete[] _buffer[0];
  26. delete[] _buffer[1];
  27. delete[] _buffer;
  28. close(_file);
  29. }
  30. // _____________________________________________________________________________
  31. void File::reset() {
  32. _which = 0;
  33. _s.s = NONE;
  34. _s.hanging = 0;
  35. _totReadBef = 0;
  36. if (_file) close(_file);
  37. _file = open(_path.c_str(), O_RDONLY);
  38. if (_file < 0) throw XmlFileException(std::string("could not open ") + _path);
  39. #ifdef __unix__
  40. posix_fadvise(_file, 0, 0, POSIX_FADV_SEQUENTIAL);
  41. #endif
  42. _lastBytes = read(_file, _buffer[_which], BUFFER_S);
  43. _lastNewData = _lastBytes;
  44. _c = _buffer[_which];
  45. while (!_s.tagStack.empty()) _s.tagStack.pop();
  46. _s.tagStack.push("[root]");
  47. _prevs = _s;
  48. }
  49. // _____________________________________________________________________________
  50. size_t File::level() const { return _s.tagStack.size() - _s.hanging; }
  51. // _____________________________________________________________________________
// Returns a copy of _prevs, the state snapshot that next() records on entry
// (and that reset() / setState() overwrite) - not the live state _s. Passing
// the returned value to setState() rewinds the parser to that position.
ParserState File::state() { return _prevs; }
  53. // _____________________________________________________________________________
  54. void File::setState(const ParserState& s) {
  55. _s = s;
  56. _prevs = s;
  57. lseek(_file, _s.off, SEEK_SET);
  58. _totReadBef = _s.off;
  59. _lastBytes = read(_file, _buffer[_which], BUFFER_S);
  60. _lastNewData = _lastBytes;
  61. _c = _buffer[_which];
  62. next();
  63. }
  64. // _____________________________________________________________________________
// Returns the tag record (_ret) that the last next() call filled in.
// next() sets _ret.name to a position inside the internal read buffer (or to
// the literal "[root]" at end of input), so the name is presumably only valid
// until parsing continues - TODO confirm against Tag's declaration.
const Tag& File::get() const { return _ret; }
  66. // _____________________________________________________________________________
  67. bool File::next() {
  68. if (!_s.tagStack.size()) return false;
  69. // avoid too much stack copying
  70. if (_prevs.tagStack.size() != _s.tagStack.size() ||
  71. _prevs.tagStack.top() != _s.tagStack.top()) {
  72. _prevs.tagStack = _s.tagStack;
  73. }
  74. _prevs.s = _s.s;
  75. _prevs.hanging = _s.hanging;
  76. _prevs.off =
  77. _totReadBef + (_c - _buffer[_which]) - (_lastBytes - _lastNewData);
  78. if (_s.hanging) _s.hanging--;
  79. _ret.name = 0;
  80. _ret.attrs.clear();
  81. void* i;
  82. while (_lastBytes) {
  83. for (; _c - _buffer[_which] < _lastBytes; ++_c) {
  84. char c = *_c;
  85. switch (_s.s) {
  86. case NONE:
  87. if (std::isspace(c))
  88. continue;
  89. else if (c == '<') {
  90. _s.s = IN_TAG_TENTATIVE;
  91. continue;
  92. } else {
  93. _s.s = IN_TEXT;
  94. continue;
  95. }
  96. case IN_TEXT:
  97. throw XmlFileException("text nodes not yet supported");
  98. case IN_TAG_TENTATIVE:
  99. if (c == '/') {
  100. _s.s = IN_TAG_NAME_CLOSE;
  101. _tmp = _c + 1;
  102. continue;
  103. } else if (c == '?') {
  104. _s.s = IN_TAG_NAME_META;
  105. continue;
  106. } else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
  107. _s.s = IN_TAG_NAME;
  108. _ret.name = _c;
  109. continue;
  110. }
  111. case IN_TAG:
  112. if (std::isspace(c))
  113. continue;
  114. else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
  115. _s.s = IN_ATTRKEY;
  116. _tmp = _c;
  117. continue;
  118. } else if (c == '/') {
  119. _s.s = AW_CLOSING;
  120. continue;
  121. } else if (c == '>') {
  122. _s.hanging++;
  123. _s.tagStack.push(_ret.name);
  124. _s.s = WS_SKIP;
  125. continue;
  126. } else {
  127. throw XmlFileException("expected valid tag");
  128. }
  129. case IN_ATTRVAL_SQ:
  130. i = memchr(_c, '\'', _lastBytes - (_c - _buffer[_which]));
  131. if (!i) {
  132. _c = _buffer[_which] + _lastBytes;
  133. continue;
  134. } else {
  135. _c = (char*)i;
  136. _s.s = IN_TAG;
  137. *_c = 0;
  138. _ret.attrs[_tmp] = _tmp2;
  139. continue;
  140. }
  141. case IN_ATTRVAL_DQ:
  142. i = memchr(_c, '"', _lastBytes - (_c - _buffer[_which]));
  143. if (!i) {
  144. _c = _buffer[_which] + _lastBytes;
  145. continue;
  146. } else {
  147. _c = (char*)i;
  148. _s.s = IN_TAG;
  149. *_c = 0;
  150. _ret.attrs[_tmp] = _tmp2;
  151. continue;
  152. }
  153. case AW_IN_ATTRVAL:
  154. if (std::isspace(c))
  155. continue;
  156. else if (c == '\'') {
  157. _s.s = IN_ATTRVAL_SQ;
  158. _tmp2 = _c + 1;
  159. continue;
  160. } else if (c == '"') {
  161. _s.s = IN_ATTRVAL_DQ;
  162. _tmp2 = _c + 1;
  163. continue;
  164. } else {
  165. throw XmlFileException("expected attribute value");
  166. }
  167. case IN_ATTRKEY:
  168. if (std::isspace(c)) {
  169. *_c = 0;
  170. _s.s = AFTER_ATTRKEY;
  171. continue;
  172. } else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
  173. continue;
  174. } else if (c == '=') {
  175. *_c = 0;
  176. _s.s = AW_IN_ATTRVAL;
  177. continue;
  178. }
  179. throw XmlFileException("expected attribute key char or =");
  180. case AFTER_ATTRKEY:
  181. if (std::isspace(c))
  182. continue;
  183. else if (c == '=') {
  184. _s.s = AW_IN_ATTRVAL;
  185. continue;
  186. } else {
  187. // TODO: error
  188. continue;
  189. }
  190. case IN_TAG_NAME:
  191. if (std::isspace(c)) {
  192. *_c = 0;
  193. _s.s = IN_TAG;
  194. continue;
  195. } else if (c == '>') {
  196. *_c = 0;
  197. _s.hanging++;
  198. _s.tagStack.push(_ret.name);
  199. _s.s = WS_SKIP;
  200. continue;
  201. } else if (c == '/') {
  202. *_c = 0;
  203. _s.s = AW_CLOSING;
  204. continue;
  205. } else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
  206. continue;
  207. }
  208. case IN_TAG_NAME_META:
  209. // TODO: read meta tags!
  210. if (c == '>') {
  211. _s.s = NONE;
  212. continue;
  213. }
  214. continue;
  215. case IN_TAG_NAME_CLOSE:
  216. if (std::isspace(c)) {
  217. *_c = 0;
  218. _s.s = IN_TAG_CLOSE;
  219. continue;
  220. } else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
  221. continue;
  222. } else if (c == '>') {
  223. *_c = 0;
  224. if (_tmp != _s.tagStack.top()) {
  225. throw XmlFileException("closing wrong tag");
  226. }
  227. _s.tagStack.pop();
  228. _s.s = NONE;
  229. continue;
  230. }
  231. case IN_TAG_CLOSE:
  232. if (std::isspace(c))
  233. continue;
  234. else if (c == '>') {
  235. if (_tmp != _s.tagStack.top()) {
  236. throw XmlFileException("closing wrong tag");
  237. }
  238. _s.tagStack.pop();
  239. _s.s = NONE;
  240. continue;
  241. } else {
  242. throw XmlFileException("expected '>'");
  243. }
  244. case AW_CLOSING:
  245. if (c == '>') {
  246. _s.s = WS_SKIP;
  247. continue;
  248. }
  249. case WS_SKIP:
  250. if (std::isspace(c))
  251. continue;
  252. else {
  253. _s.s = NONE;
  254. return true;
  255. }
  256. }
  257. }
  258. // buffer ended, read new stuff, but copy remaining if needed
  259. size_t off = 0;
  260. if (_s.s == IN_TAG_NAME) { //|| IN_TAG_NAME_META) {
  261. off = _lastBytes - (_ret.name - _buffer[_which]);
  262. memmove(_buffer[!_which], _ret.name, off);
  263. _ret.name = _buffer[!_which];
  264. } else if (_s.s == IN_TAG_NAME_CLOSE || _s.s == IN_ATTRKEY ||
  265. _s.s == IN_TEXT) {
  266. off = _lastBytes - (_tmp - _buffer[_which]);
  267. memmove(_buffer[!_which], _tmp, off);
  268. _tmp = _buffer[!_which];
  269. } else if (_s.s == IN_ATTRVAL_SQ || _s.s == IN_ATTRVAL_DQ) {
  270. off = _lastBytes - (_tmp2 - _buffer[_which]);
  271. memmove(_buffer[!_which], _tmp2, off);
  272. _tmp2 = _buffer[!_which];
  273. }
  274. assert(off <= BUFFER_S);
  275. size_t readb = read(_file, _buffer[!_which] + off, BUFFER_S - off);
  276. if (!readb) break;
  277. _totReadBef += _lastNewData;
  278. _which = !_which;
  279. _lastNewData = readb;
  280. _lastBytes = _lastNewData + off;
  281. _c = _buffer[_which] + off;
  282. }
  283. if (_s.tagStack.size()) {
  284. if (_s.tagStack.top() != "[root]") {
  285. // TODO error
  286. throw XmlFileException("XML tree not complete");
  287. } else {
  288. _s.tagStack.pop();
  289. }
  290. }
  291. _s.s = NONE;
  292. _ret.name = "[root]";
  293. return false;
  294. }
  295. // _____________________________________________________________________________
  296. std::string File::decode(const std::string& str) { return decode(str.c_str()); }
  297. // _____________________________________________________________________________
  298. std::string File::decode(const char* str) {
  299. const char* c = strchr(str, '&');
  300. if (!c) return str;
  301. char* decRet = new char[strlen(str) + 1];
  302. const char* last = str;
  303. char* dstPt = decRet;
  304. for (; c != 0; c = strchr(c + 1, '&')) {
  305. memcpy(dstPt, last, c - last);
  306. dstPt += c - last;
  307. last = c;
  308. if (*(c + 1) == '#') {
  309. uint64_t cp = -1;
  310. char* tail;
  311. errno = 0;
  312. if (*(c + 2) == 'x' || *(c + 2) == 'X')
  313. cp = strtoul(c + 3, &tail, 16);
  314. else
  315. cp = strtoul(c + 2, &tail, 10);
  316. if (*tail == ';' && cp <= 0x1FFFFF && !errno) {
  317. dstPt += utf8(cp, dstPt);
  318. last = tail + 1;
  319. }
  320. } else {
  321. const char* e = strchr(c, ';');
  322. if (e) {
  323. char* ent = new char[e - 1 - c + 1];
  324. memcpy(ent, c + 1, e - 1 - c);
  325. ent[e - 1 - c] = 0;
  326. const auto it = xml::ENTITIES.find(ent);
  327. if (it != xml::ENTITIES.end()) {
  328. const char* utf8 = it->second;
  329. memcpy(dstPt, utf8, strlen(utf8));
  330. dstPt += strlen(utf8);
  331. last += strlen(ent) + 2;
  332. }
  333. delete[] ent;
  334. }
  335. }
  336. }
  337. strcpy(dstPt, last);
  338. std::string ret(decRet);
  339. delete[] decRet;
  340. return ret;
  341. }
  342. // _____________________________________________________________________________
  343. size_t File::utf8(size_t cp, char* out) {
  344. if (cp <= 0x7F) {
  345. out[0] = cp & 0x7F;
  346. return 1;
  347. } else if (cp <= 0x7FF) {
  348. out[0] = 0xC0 | (cp >> 6);
  349. out[1] = 0x80 | (cp & 0x3F);
  350. return 2;
  351. } else if (cp <= 0xFFFF) {
  352. out[0] = 0xE0 | (cp >> 12);
  353. out[1] = 0x80 | ((cp >> 6) & 0x3F);
  354. out[2] = 0x80 | (cp & 0x3F);
  355. return 3;
  356. } else if (cp <= 0x1FFFFF) {
  357. out[0] = 0xF0 | (cp >> 18);
  358. out[1] = 0x80 | ((cp >> 12) & 0x3F);
  359. out[2] = 0x80 | ((cp >> 6) & 0x3F);
  360. out[3] = 0x80 | (cp & 0x3F);
  361. return 4;
  362. }
  363. return 0;
  364. }