Patrick Brosi 5 年 前
コミット
759c9a9744
共有6 個のファイルを変更した834 個の追加873 個の削除を含む
  1. 0 2
      CMakeLists.txt
  2. 0 478
      File.cpp
  3. 0 118
      File.h
  4. 0 272
      NamedEnts.h
  5. 5 3
      README.md
  6. 829 0
      pfxml.h

+ 0 - 2
CMakeLists.txt

@@ -1,2 +0,0 @@
1
-file(GLOB_RECURSE xml_SRC *.cpp)
2
-add_library(xml ${xml_SRC})

+ 0 - 478
File.cpp

@@ -1,478 +0,0 @@
1
-// Copyright 2017 Patrick Brosi
2
-// info@patrickbrosi.de
3
-
4
-#include <fcntl.h>
5
-#include <sys/stat.h>
6
-#include <sys/types.h>
7
-#include <unistd.h>
8
-#include <cassert>
9
-#include <cstring>
10
-#include <fstream>
11
-#include <iostream>
12
-#include <map>
13
-#include "xml/File.h"
14
-#include "xml/NamedEnts.h"
15
-
16
-using namespace xml;
17
-
18
-// _____________________________________________________________________________
19
-File::File(const std::string& path)
20
-    : _file(0), _c(0), _lastBytes(0), _which(0), _path(path), _totReadBef(0) {
21
-  _buffer = new char*[2];
22
-  _buffer[0] = new char[BUFFER_S + 1];
23
-  _buffer[1] = new char[BUFFER_S + 1];
24
-
25
-  reset();
26
-}
27
-
28
-// _____________________________________________________________________________
29
-File::~File() {
30
-  delete[] _buffer[0];
31
-  delete[] _buffer[1];
32
-  delete[] _buffer;
33
-  close(_file);
34
-}
35
-
36
-// _____________________________________________________________________________
37
-void File::reset() {
38
-  _which = 0;
39
-  _s.s = NONE;
40
-  _s.hanging = 0;
41
-  _totReadBef = 0;
42
-
43
-  if (_file) close(_file);
44
-  _file = open(_path.c_str(), O_RDONLY);
45
-  if (_file < 0)
46
-    throw XmlFileException(std::string("could not open file"), _path, 0, 0, 0);
47
-#ifdef __unix__
48
-  posix_fadvise(_file, 0, 0, POSIX_FADV_SEQUENTIAL);
49
-#endif
50
-
51
-  _lastBytes = read(_file, _buffer[_which], BUFFER_S);
52
-  _lastNewData = _lastBytes;
53
-  _c = _buffer[_which];
54
-  while (!_s.tagStack.empty()) _s.tagStack.pop();
55
-  _s.tagStack.push("[root]");
56
-  _prevs = _s;
57
-}
58
-
59
-// _____________________________________________________________________________
60
-size_t File::level() const { return _s.tagStack.size() - _s.hanging; }
61
-
62
-// _____________________________________________________________________________
63
-ParserState File::state() { return _prevs; }
64
-
65
-// _____________________________________________________________________________
66
-void File::setState(const ParserState& s) {
67
-  _s = s;
68
-  _prevs = s;
69
-
70
-  lseek(_file, _s.off, SEEK_SET);
71
-  _totReadBef = _s.off;
72
-  _lastBytes = read(_file, _buffer[_which], BUFFER_S);
73
-  _lastNewData = _lastBytes;
74
-  _c = _buffer[_which];
75
-
76
-  next();
77
-}
78
-
79
-// _____________________________________________________________________________
80
-const Tag& File::get() const { return _ret; }
81
-
82
-// _____________________________________________________________________________
83
-bool File::next() {
84
-  if (!_s.tagStack.size()) return false;
85
-  // avoid too much stack copying
86
-  if (_prevs.tagStack.size() != _s.tagStack.size() ||
87
-      _prevs.tagStack.top() != _s.tagStack.top()) {
88
-    _prevs.tagStack = _s.tagStack;
89
-  }
90
-  _prevs.s = _s.s;
91
-  _prevs.hanging = _s.hanging;
92
-  _prevs.off =
93
-      _totReadBef + (_c - _buffer[_which]) - (_lastBytes - _lastNewData);
94
-
95
-  if (_s.hanging) _s.hanging--;
96
-  _ret.name = 0;
97
-  _ret.text = emptyStr;
98
-  _ret.attrs.clear();
99
-  void* i;
100
-  while (_lastBytes) {
101
-    for (; _c - _buffer[_which] < _lastBytes; ++_c) {
102
-      char c = *_c;
103
-      switch (_s.s) {
104
-        case NONE:
105
-          if (std::isspace(c))
106
-            continue;
107
-          else if (c == '<') {
108
-            _s.s = IN_TAG_TENTATIVE;
109
-            continue;
110
-          } else {
111
-            _s.s = IN_TEXT;
112
-            _ret.name = emptyStr;
113
-            _tmp = _c;
114
-            continue;
115
-          }
116
-        case IN_TEXT:
117
-          i = memchr(_c, '<', _lastBytes - (_c - _buffer[_which]));
118
-          if (!i) {
119
-            _c = _buffer[_which] + _lastBytes;
120
-            continue;
121
-          } else {
122
-            _c = (char*)i;
123
-            *_c = 0;
124
-            _ret.text = _tmp;
125
-            _s.s = IN_TAG_TENTATIVE;
126
-            _c++;
127
-            return true;
128
-          }
129
-        case IN_COMMENT_TENTATIVE:
130
-          if (c == '-') {
131
-            _s.s = IN_COMMENT_TENTATIVE2;
132
-            continue;
133
-          }
134
-          throw XmlFileException("Expected comment", _path, _c, _buffer[_which],
135
-                                 _prevs.off);
136
-
137
-        case IN_COMMENT_TENTATIVE2:
138
-          if (c == '-') {
139
-            _s.s = IN_COMMENT;
140
-            continue;
141
-          }
142
-          throw XmlFileException("Expected comment", _path, _c, _buffer[_which],
143
-                                 _prevs.off);
144
-
145
-        case IN_COMMENT_CL_TENTATIVE:
146
-          if (c == '-') {
147
-            _s.s = IN_COMMENT_CL_TENTATIVE2;
148
-            continue;
149
-          }
150
-          _s.s = IN_COMMENT;
151
-          continue;
152
-
153
-        case IN_COMMENT_CL_TENTATIVE2:
154
-          if (c == '>') {
155
-            _s.s = NONE;
156
-            continue;
157
-          }
158
-          _s.s = IN_COMMENT;
159
-        // fall through, we are still in comment
160
-
161
-        case IN_COMMENT:
162
-          i = memchr(_c, '-', _lastBytes - (_c - _buffer[_which]));
163
-          if (!i) {
164
-            _c = _buffer[_which] + _lastBytes;
165
-            continue;
166
-          } else {
167
-            _c = (char*)i;
168
-            _s.s = IN_COMMENT_CL_TENTATIVE;
169
-            continue;
170
-          }
171
-        case IN_TAG_TENTATIVE:
172
-          if (c == '/') {
173
-            _s.s = IN_TAG_NAME_CLOSE;
174
-            _tmp = _c + 1;
175
-            continue;
176
-          } else if (c == '?') {
177
-            _s.s = IN_TAG_NAME_META;
178
-            continue;
179
-          } else if (c == '!') {
180
-            _s.s = IN_COMMENT_TENTATIVE;
181
-            continue;
182
-          } else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
183
-            _s.s = IN_TAG_NAME;
184
-            _ret.name = _c;
185
-            continue;
186
-          }
187
-
188
-        case IN_TAG:
189
-          if (std::isspace(c))
190
-            continue;
191
-          else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
192
-            _s.s = IN_ATTRKEY;
193
-            _tmp = _c;
194
-            continue;
195
-          } else if (c == '/') {
196
-            _s.s = AW_CLOSING;
197
-            continue;
198
-          } else if (c == '>') {
199
-            _s.hanging++;
200
-            _s.tagStack.push(_ret.name);
201
-            _s.s = WS_SKIP;
202
-            continue;
203
-          } else {
204
-            throw XmlFileException("Expected valid tag", _path, _c,
205
-                                   _buffer[_which], _prevs.off);
206
-          }
207
-
208
-        case IN_ATTRVAL_SQ:
209
-          i = memchr(_c, '\'', _lastBytes - (_c - _buffer[_which]));
210
-          if (!i) {
211
-            _c = _buffer[_which] + _lastBytes;
212
-            continue;
213
-          } else {
214
-            _c = (char*)i;
215
-            _s.s = IN_TAG;
216
-            *_c = 0;
217
-            _ret.attrs[_tmp] = _tmp2;
218
-            continue;
219
-          }
220
-
221
-        case IN_ATTRVAL_DQ:
222
-          i = memchr(_c, '"', _lastBytes - (_c - _buffer[_which]));
223
-          if (!i) {
224
-            _c = _buffer[_which] + _lastBytes;
225
-            continue;
226
-          } else {
227
-            _c = (char*)i;
228
-            _s.s = IN_TAG;
229
-            *_c = 0;
230
-            _ret.attrs[_tmp] = _tmp2;
231
-            continue;
232
-          }
233
-
234
-        case AW_IN_ATTRVAL:
235
-          if (std::isspace(c))
236
-            continue;
237
-          else if (c == '\'') {
238
-            _s.s = IN_ATTRVAL_SQ;
239
-            _tmp2 = _c + 1;
240
-            continue;
241
-          } else if (c == '"') {
242
-            _s.s = IN_ATTRVAL_DQ;
243
-            _tmp2 = _c + 1;
244
-            continue;
245
-          } else {
246
-            throw XmlFileException("Expected attribute value", _path, _c,
247
-                                   _buffer[_which], _prevs.off);
248
-          }
249
-
250
-        case IN_ATTRKEY:
251
-          if (std::isspace(c)) {
252
-            *_c = 0;
253
-            _s.s = AFTER_ATTRKEY;
254
-            continue;
255
-          } else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
256
-            continue;
257
-          } else if (c == '=') {
258
-            *_c = 0;
259
-            _s.s = AW_IN_ATTRVAL;
260
-            continue;
261
-          }
262
-
263
-          throw XmlFileException("Expected attribute key char or =", _path, _c,
264
-                                 _buffer[_which], _prevs.off);
265
-
266
-        case AFTER_ATTRKEY:
267
-          if (std::isspace(c))
268
-            continue;
269
-          else if (c == '=') {
270
-            _s.s = AW_IN_ATTRVAL;
271
-            continue;
272
-          } else {
273
-            throw XmlFileException(
274
-                std::string("Expected attribute value for '") + _tmp + "'.",
275
-                _path, _c, _buffer[_which], _prevs.off);
276
-          }
277
-
278
-        case IN_TAG_NAME:
279
-          if (std::isspace(c)) {
280
-            *_c = 0;
281
-            _s.s = IN_TAG;
282
-            continue;
283
-          } else if (c == '>') {
284
-            *_c = 0;
285
-            _s.hanging++;
286
-            _s.tagStack.push(_ret.name);
287
-            _s.s = WS_SKIP;
288
-            continue;
289
-          } else if (c == '/') {
290
-            *_c = 0;
291
-            _s.s = AW_CLOSING;
292
-            continue;
293
-          } else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
294
-            continue;
295
-          }
296
-
297
-        case IN_TAG_NAME_META:
298
-          // TODO: read meta tags!
299
-          if (c == '>') {
300
-            _s.s = NONE;
301
-            continue;
302
-          }
303
-
304
-          continue;
305
-
306
-        case IN_TAG_NAME_CLOSE:
307
-          if (std::isspace(c)) {
308
-            *_c = 0;
309
-            _s.s = IN_TAG_CLOSE;
310
-            continue;
311
-          } else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
312
-            continue;
313
-          } else if (c == '>') {
314
-            *_c = 0;
315
-            if (_tmp != _s.tagStack.top()) {
316
-              throw XmlFileException(std::string("Closing wrong tag '<") + _tmp +
317
-                                         ">', expected close of '<" +
318
-                                         _s.tagStack.top() + ">'.",
319
-                                     _path, _c, _buffer[_which], _prevs.off);
320
-            }
321
-            _s.tagStack.pop();
322
-            _s.s = NONE;
323
-            continue;
324
-          }
325
-
326
-        case IN_TAG_CLOSE:
327
-          if (std::isspace(c))
328
-            continue;
329
-          else if (c == '>') {
330
-            if (_tmp != _s.tagStack.top()) {
331
-              throw XmlFileException(std::string("Closing wrong tag '<") + _tmp +
332
-                                         ">', expected close of '<" +
333
-                                         _s.tagStack.top() + ">'.",
334
-                                     _path, _c, _buffer[_which], _prevs.off);
335
-            }
336
-            _s.tagStack.pop();
337
-            _s.s = NONE;
338
-            continue;
339
-          } else {
340
-            throw XmlFileException("Expected '>'", _path, _c, _buffer[_which],
341
-                                   _prevs.off);
342
-          }
343
-
344
-        case AW_CLOSING:
345
-          if (c == '>') {
346
-            _s.s = WS_SKIP;
347
-            continue;
348
-          }
349
-
350
-        case WS_SKIP:
351
-          if (std::isspace(c))
352
-            continue;
353
-          else {
354
-            _s.s = NONE;
355
-            return true;
356
-          }
357
-      }
358
-    }
359
-
360
-    // buffer ended, read new stuff, but copy remaining if needed
361
-    size_t off = 0;
362
-    if (_s.s == IN_TAG_NAME) {  //|| IN_TAG_NAME_META) {
363
-      off = _lastBytes - (_ret.name - _buffer[_which]);
364
-      memmove(_buffer[!_which], _ret.name, off);
365
-      _ret.name = _buffer[!_which];
366
-    } else if (_s.s == IN_TAG_NAME_CLOSE || _s.s == IN_ATTRKEY ||
367
-               _s.s == IN_TEXT) {
368
-      off = _lastBytes - (_tmp - _buffer[_which]);
369
-      memmove(_buffer[!_which], _tmp, off);
370
-      _tmp = _buffer[!_which];
371
-    } else if (_s.s == IN_ATTRVAL_SQ || _s.s == IN_ATTRVAL_DQ) {
372
-      off = _lastBytes - (_tmp2 - _buffer[_which]);
373
-      memmove(_buffer[!_which], _tmp2, off);
374
-      _tmp2 = _buffer[!_which];
375
-    }
376
-
377
-    assert(off <= BUFFER_S);
378
-
379
-    size_t readb = read(_file, _buffer[!_which] + off, BUFFER_S - off);
380
-    if (!readb) break;
381
-    _totReadBef += _lastNewData;
382
-    _which = !_which;
383
-    _lastNewData = readb;
384
-    _lastBytes = _lastNewData + off;
385
-    _c = _buffer[_which] + off;
386
-  }
387
-
388
-  if (_s.tagStack.size()) {
389
-    if (_s.tagStack.top() != "[root]") {
390
-      throw XmlFileException("XML tree not complete", _path, _c,
391
-                             _buffer[_which], _prevs.off);
392
-    } else {
393
-      _s.tagStack.pop();
394
-    }
395
-  }
396
-  _s.s = NONE;
397
-  _ret.name = "[root]";
398
-  return false;
399
-}
400
-
401
-// _____________________________________________________________________________
402
-std::string File::decode(const std::string& str) { return decode(str.c_str()); }
403
-
404
-// _____________________________________________________________________________
405
-std::string File::decode(const char* str) {
406
-  const char* c = strchr(str, '&');
407
-  if (!c) return str;
408
-
409
-  char* decRet = new char[strlen(str) + 1];
410
-  const char* last = str;
411
-  char* dstPt = decRet;
412
-
413
-  for (; c != 0; c = strchr(c + 1, '&')) {
414
-    memcpy(dstPt, last, c - last);
415
-    dstPt += c - last;
416
-    last = c;
417
-
418
-    if (*(c + 1) == '#') {
419
-      uint64_t cp = -1;
420
-      char* tail;
421
-      errno = 0;
422
-      if (*(c + 2) == 'x' || *(c + 2) == 'X')
423
-        cp = strtoul(c + 3, &tail, 16);
424
-      else
425
-        cp = strtoul(c + 2, &tail, 10);
426
-
427
-      if (*tail == ';' && cp <= 0x1FFFFF && !errno) {
428
-        dstPt += utf8(cp, dstPt);
429
-        last = tail + 1;
430
-      }
431
-    } else {
432
-      const char* e = strchr(c, ';');
433
-      if (e) {
434
-        char* ent = new char[e - 1 - c + 1];
435
-        memcpy(ent, c + 1, e - 1 - c);
436
-        ent[e - 1 - c] = 0;
437
-        const auto it = xml::ENTITIES.find(ent);
438
-        if (it != xml::ENTITIES.end()) {
439
-          const char* utf8 = it->second;
440
-          memcpy(dstPt, utf8, strlen(utf8));
441
-          dstPt += strlen(utf8);
442
-          last += strlen(ent) + 2;
443
-        }
444
-        delete[] ent;
445
-      }
446
-    }
447
-  }
448
-
449
-  strcpy(dstPt, last);
450
-  std::string ret(decRet);
451
-  delete[] decRet;
452
-  return ret;
453
-}
454
-
455
-// _____________________________________________________________________________
456
-size_t File::utf8(size_t cp, char* out) {
457
-  if (cp <= 0x7F) {
458
-    out[0] = cp & 0x7F;
459
-    return 1;
460
-  } else if (cp <= 0x7FF) {
461
-    out[0] = 0xC0 | (cp >> 6);
462
-    out[1] = 0x80 | (cp & 0x3F);
463
-    return 2;
464
-  } else if (cp <= 0xFFFF) {
465
-    out[0] = 0xE0 | (cp >> 12);
466
-    out[1] = 0x80 | ((cp >> 6) & 0x3F);
467
-    out[2] = 0x80 | (cp & 0x3F);
468
-    return 3;
469
-  } else if (cp <= 0x1FFFFF) {
470
-    out[0] = 0xF0 | (cp >> 18);
471
-    out[1] = 0x80 | ((cp >> 12) & 0x3F);
472
-    out[2] = 0x80 | ((cp >> 6) & 0x3F);
473
-    out[3] = 0x80 | (cp & 0x3F);
474
-    return 4;
475
-  }
476
-
477
-  return 0;
478
-}

+ 0 - 118
File.h

@@ -1,118 +0,0 @@
1
-// Copyright 2017 Patrick Brosi
2
-// info@patrickbrosi.de
3
-
4
-#ifndef XML_FILE_H_
5
-#define XML_FILE_H_
6
-
7
-#include <cstring>
8
-#include <fstream>
9
-#include <map>
10
-#include <sstream>
11
-#include <stack>
12
-#include <string>
13
-
14
-namespace xml {
15
-
16
-const static size_t BUFFER_S = 16 * 1024;
17
-
18
-class XmlFileException : public std::exception {
19
- public:
20
-  XmlFileException(std::string msg, std::string file, const char* p, char* buff,
21
-                   size_t offset) {
22
-    std::stringstream ss;
23
-    ss << file << " at position " << (offset + (p - buff)) << ": " << msg;
24
-    _msg = ss.str();
25
-  }
26
-  ~XmlFileException() throw() {}
27
-
28
-  virtual const char* what() const throw() { return _msg.c_str(); };
29
-
30
- private:
31
-  std::string _msg;
32
-};
33
-
34
-enum State {
35
-  NONE,
36
-  IN_TAG_NAME,
37
-  IN_TAG_NAME_META,
38
-  IN_TAG,
39
-  IN_TAG_CLOSE,
40
-  IN_TAG_NAME_CLOSE,
41
-  IN_TAG_TENTATIVE,
42
-  IN_ATTRKEY,
43
-  AFTER_ATTRKEY,
44
-  AW_IN_ATTRVAL,
45
-  IN_ATTRVAL_SQ,
46
-  IN_ATTRVAL_DQ,
47
-  IN_TEXT,
48
-  IN_COMMENT_TENTATIVE,
49
-  IN_COMMENT_TENTATIVE2,
50
-  IN_COMMENT,
51
-  IN_COMMENT_CL_TENTATIVE,
52
-  IN_COMMENT_CL_TENTATIVE2,
53
-  AW_CLOSING,
54
-  WS_SKIP
55
-};
56
-
57
-struct AttrCmp {
58
-  bool operator()(const char* const& a, const char* const& b) const {
59
-    return std::strcmp(a, b) < 0;
60
-  }
61
-};
62
-
63
-struct ParserState {
64
-  ParserState() : s(NONE), hanging(0), off(0){};
65
-  std::stack<std::string> tagStack;
66
-  State s;
67
-  size_t hanging;
68
-  int64_t off;
69
-};
70
-
71
-typedef std::map<const char*, const char*, AttrCmp> AttrMap;
72
-
73
-struct Tag {
74
-  const char* name;
75
-  const char* text;
76
-  AttrMap attrs;
77
-};
78
-
79
-class File {
80
- public:
81
-  File(const std::string& path);
82
-  ~File();
83
-
84
-  const Tag& get() const;
85
-
86
-  bool next();
87
-  size_t level() const;
88
-  void reset();
89
-  ParserState state();
90
-  void setState(const ParserState& s);
91
-  static std::string decode(const char* str);
92
-  static std::string decode(const std::string& str);
93
-
94
- private:
95
-  int _file;
96
-  ParserState _s;
97
-  ParserState _prevs;
98
-  char** _buffer;
99
-  char* _c;
100
-  int64_t _lastBytes;
101
-
102
-  const char* _tmp;
103
-  const char* _tmp2;
104
-
105
-  size_t _which;
106
-  std::string _path;
107
-
108
-  int64_t _totReadBef;
109
-  int64_t _lastNewData;
110
-
111
-  Tag _ret;
112
-
113
-  static size_t utf8(size_t cp, char* out);
114
-  const char* emptyStr = "";
115
-};
116
-}
117
-
118
-#endif  // XML_FILE_H_

+ 0 - 272
NamedEnts.h

@@ -1,272 +0,0 @@
1
-// Copyright 2017 Patrick Brosi
2
-// info@patrickbrosi.de
3
-
4
-#ifndef XML_NAMEDENTS_H_
5
-#define XML_NAMEDENTS_H_
6
-
7
-#include <map>
8
-#include <string>
9
-
10
-namespace xml {
11
-
12
-// see
13
-// http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
14
-std::map<std::string, const char*> ENTITIES = {
15
-  {"aacute", "á"},
16
-  {"Aacute", "Á"},
17
-  {"acirc", "â"},
18
-  {"Acirc", "Â"},
19
-  {"acute", "´"},
20
-  {"aelig", "æ"},
21
-  {"AElig", "Æ"},
22
-  {"agrave", "à"},
23
-  {"Agrave", "À"},
24
-  {"alefsym", "ℵ"},
25
-  {"alpha", "α"},
26
-  {"Alpha", "Α"},
27
-  {"amp", "&"},
28
-  {"and", "∧"},
29
-  {"ang", "∠"},
30
-  {"apos", "'"},
31
-  {"aring", "å"},
32
-  {"Aring", "Å"},
33
-  {"asymp", "≈"},
34
-  {"atilde", "ã"},
35
-  {"Atilde", "Ã"},
36
-  {"auml", "ä"},
37
-  {"Auml", "Ä"},
38
-  {"bdquo", "„"},
39
-  {"beta", "β"},
40
-  {"Beta", "Β"},
41
-  {"brvbar", "¦"},
42
-  {"bull", "•"},
43
-  {"cap", "∩"},
44
-  {"ccedil", "ç"},
45
-  {"Ccedil", "Ç"},
46
-  {"cedil", "¸"},
47
-  {"cent", "¢"},
48
-  {"chi", "χ"},
49
-  {"Chi", "Χ"},
50
-  {"circ", "ˆ"},
51
-  {"clubs", "♣"},
52
-  {"cong", "≅"},
53
-  {"copy", "©"},
54
-  {"crarr", "↵"},
55
-  {"cup", "∪"},
56
-  {"curren", "¤"},
57
-  {"dagger", "†"},
58
-  {"Dagger", "‡"},
59
-  {"darr", "↓"},
60
-  {"dArr", "⇓"},
61
-  {"deg", "°"},
62
-  {"delta", "δ"},
63
-  {"Delta", "Δ"},
64
-  {"diams", "♦"},
65
-  {"divide", "÷"},
66
-  {"eacute", "é"},
67
-  {"Eacute", "É"},
68
-  {"ecirc", "ê"},
69
-  {"Ecirc", "Ê"},
70
-  {"egrave", "è"},
71
-  {"Egrave", "È"},
72
-  {"empty", "∅"},
73
-  {"emsp", "\xE2\x80\x83"},
74
-  {"ensp", "\xE2\x80\x82"},
75
-  {"epsilon", "ε"},
76
-  {"Epsilon", "Ε"},
77
-  {"equiv", "≡"},
78
-  {"eta", "η"},
79
-  {"Eta", "Η"},
80
-  {"eth", "ð"},
81
-  {"ETH", "Ð"},
82
-  {"euml", "ë"},
83
-  {"Euml", "Ë"},
84
-  {"euro", "€"},
85
-  {"exist", "∃"},
86
-  {"fnof", "ƒ"},
87
-  {"forall", "∀"},
88
-  {"frac12", "½"},
89
-  {"frac14", "¼"},
90
-  {"frac34", "¾"},
91
-  {"frasl", "⁄"},
92
-  {"gamma", "γ"},
93
-  {"Gamma", "Γ"},
94
-  {"ge", "≥"},
95
-  {"gt", ">"},
96
-  {"harr", "↔"},
97
-  {"hArr", "⇔"},
98
-  {"hearts", "♥"},
99
-  {"hellip", "…"},
100
-  {"iacute", "í"},
101
-  {"Iacute", "Í"},
102
-  {"icirc", "î"},
103
-  {"Icirc", "Î"},
104
-  {"iexcl", "¡"},
105
-  {"igrave", "ì"},
106
-  {"Igrave", "Ì"},
107
-  {"image", "ℑ"},
108
-  {"infin", "∞"},
109
-  {"int", "∫"},
110
-  {"iota", "ι"},
111
-  {"Iota", "Ι"},
112
-  {"iquest", "¿"},
113
-  {"isin", "∈"},
114
-  {"iuml", "ï"},
115
-  {"Iuml", "Ï"},
116
-  {"kappa", "κ"},
117
-  {"Kappa", "Κ"},
118
-  {"lambda", "λ"},
119
-  {"Lambda", "Λ"},
120
-  {"lang", "〈"},
121
-  {"laquo", "«"},
122
-  {"larr", "←"},
123
-  {"lArr", "⇐"},
124
-  {"lceil", "⌈"},
125
-  {"ldquo", "“"},
126
-  {"le", "≤"},
127
-  {"lfloor", "⌊"},
128
-  {"lowast", "∗"},
129
-  {"loz", "◊"},
130
-  {"lrm", "\xE2\x80\x8E"},
131
-  {"lsaquo", "‹"},
132
-  {"lsquo", "‘"},
133
-  {"lt", "<"},
134
-  {"macr", "¯"},
135
-  {"mdash", "—"},
136
-  {"micro", "µ"},
137
-  {"middot", "·"},
138
-  {"minus", "−"},
139
-  {"mu", "μ"},
140
-  {"Mu", "Μ"},
141
-  {"nabla", "∇"},
142
-  {"nbsp", "\xC2\xA0"},
143
-  {"ndash", "–"},
144
-  {"ne", "≠"},
145
-  {"ni", "∋"},
146
-  {"not", "¬"},
147
-  {"notin", "∉"},
148
-  {"nsub", "⊄"},
149
-  {"ntilde", "ñ"},
150
-  {"Ntilde", "Ñ"},
151
-  {"nu", "ν"},
152
-  {"Nu", "Ν"},
153
-  {"oacute", "ó"},
154
-  {"Oacute", "Ó"},
155
-  {"ocirc", "ô"},
156
-  {"Ocirc", "Ô"},
157
-  {"oelig", "œ"},
158
-  {"OElig", "Œ"},
159
-  {"ograve", "ò"},
160
-  {"Ograve", "Ò"},
161
-  {"oline", "‾"},
162
-  {"omega", "ω"},
163
-  {"Omega", "Ω"},
164
-  {"omicron", "ο"},
165
-  {"Omicron", "Ο"},
166
-  {"oplus", "⊕"},
167
-  {"or", "∨"},
168
-  {"ordf", "ª"},
169
-  {"ordm", "º"},
170
-  {"oslash", "ø"},
171
-  {"Oslash", "Ø"},
172
-  {"otilde", "õ"},
173
-  {"Otilde", "Õ"},
174
-  {"otimes", "⊗"},
175
-  {"ouml", "ö"},
176
-  {"Ouml", "Ö"},
177
-  {"para", "¶"},
178
-  {"part", "∂"},
179
-  {"permil", "‰"},
180
-  {"perp", "⊥"},
181
-  {"phi", "φ"},
182
-  {"Phi", "Φ"},
183
-  {"piv", "ϖ"},
184
-  {"pi", "π"},
185
-  {"Pi", "Π"},
186
-  {"plusmn", "±"},
187
-  {"pound", "£"},
188
-  {"prime", "′"},
189
-  {"Prime", "″"},
190
-  {"prod", "∏"},
191
-  {"prop", "∝"},
192
-  {"psi", "ψ"},
193
-  {"Psi", "Ψ"},
194
-  {"quot", "\""},
195
-  {"radic", "√"},
196
-  {"rang", "〉"},
197
-  {"raquo", "»"},
198
-  {"rarr", "→"},
199
-  {"rArr", "⇒"},
200
-  {"rceil", "⌉"},
201
-  {"rdquo", "”"},
202
-  {"real", "ℜ"},
203
-  {"reg", "®"},
204
-  {"rfloor", "⌋"},
205
-  {"rho", "ρ"},
206
-  {"Rho", "Ρ"},
207
-  {"rlm", "\xE2\x80\x8F"},
208
-  {"rsaquo", "›"},
209
-  {"rsquo", "’"},
210
-  {"sbquo", "‚"},
211
-  {"scaron", "š"},
212
-  {"Scaron", "Š"},
213
-  {"sdot", "⋅"},
214
-  {"sect", "§"},
215
-  {"shy", "\xC2\xAD"},
216
-  {"sigmaf", "ς"},
217
-  {"sigma", "σ"},
218
-  {"Sigma", "Σ"},
219
-  {"sim", "∼"},
220
-  {"spades", "♠"},
221
-  {"sub", "⊂"},
222
-  {"sube", "⊆"},
223
-  {"sum", "∑"},
224
-  {"sup", "⊃"},
225
-  {"sup1", "¹"},
226
-  {"sup2", "²"},
227
-  {"sup3", "³"},
228
-  {"supe", "⊇"},
229
-  {"szlig", "ß"},
230
-  {"tau", "τ"},
231
-  {"Tau", "Τ"},
232
-  {"there4", "∴"},
233
-  {"thetasym", "ϑ"},
234
-  {"theta", "θ"},
235
-  {"Theta", "Θ"},
236
-  {"thinsp", "\xE2\x80\x89"},
237
-  {"thorn", "þ"},
238
-  {"THORN", "Þ"},
239
-  {"tilde", "˜"},
240
-  {"times", "×"},
241
-  {"trade", "™"},
242
-  {"uacute", "ú"},
243
-  {"Uacute", "Ú"},
244
-  {"uarr", "↑"},
245
-  {"uArr", "⇑"},
246
-  {"ucirc", "û"},
247
-  {"Ucirc", "Û"},
248
-  {"ugrave", "ù"},
249
-  {"Ugrave", "Ù"},
250
-  {"uml", "¨"},
251
-  {"upsih", "ϒ"},
252
-  {"upsilon", "υ"},
253
-  {"Upsilon", "Υ"},
254
-  {"uuml", "ü"},
255
-  {"Uuml", "Ü"},
256
-  {"weierp", "℘"},
257
-  {"xi", "ξ"},
258
-  {"Xi", "Ξ"},
259
-  {"yacute", "ý"},
260
-  {"Yacute", "Ý"},
261
-  {"yen", "¥"},
262
-  {"yuml", "ÿ"},
263
-  {"Yuml", "Ÿ"},
264
-  {"zeta", "ζ"},
265
-  {"Zeta", "Ζ"},
266
-  {"zwj", "\xE2\x80\x8D"},
267
-  {"zwnj", "\xE2\x80\x8C"}
268
-};
269
-}
270
-
271
-#endif  // XML_NAMEDENTS_H_
272
-

+ 5 - 3
README.md

@@ -1,15 +1,17 @@
1 1
 # Pretty fast XML parser
2 2
 
3
-Simple XML parser with minimal copying. Designed for high-speed parsing of very large XML files (like OSM XML files).
3
+* Simple XML parser with minimal copying.
4
+* Designed for high-speed parsing of very large XML files (like OSM XML files).
5
+* Should be pretty fast.
4 6
 
5 7
 ## Usage
6 8
 
7 9
 ```
8
-#include "xml/File.h"
10
+#include "pfxml.h"
9 11
 
10 12
 [...]
11 13
 
12
-xml::File xml("myfile.xml");
14
+pfxml::file xml("myfile.xml");
13 15
 
14 16
 while (xml.next()) {
15 17
   const auto& cur = xml.get();

+ 829 - 0
pfxml.h

@@ -0,0 +1,829 @@
1
+// Copyright 2017 Patrick Brosi
2
+// info@patrickbrosi.de
3
+
4
+#ifndef PFXML_H_
5
+#define PFXML_H_
6
+
7
+#include <cstring>
8
+#include <fstream>
9
+#include <map>
10
+#include <sstream>
11
+#include <stack>
12
+#include <string>
13
+
14
+namespace pfxml {
15
+
16
+static const size_t BUFFER_S = 16 * 1024;  // 16 KiB; presumably the read-buffer size, as in the removed xml::File - confirm
17
+
18
+enum state {  // scanner states kept in parser_state::s. NOTE(review): meanings below follow the removed xml::File::next(); confirm against the pfxml scanner
19
+  NONE,  // between constructs: whitespace is skipped, '<' opens a tag, anything else starts text
20
+  IN_TAG_NAME,  // reading the name of an opening tag
21
+  IN_TAG_NAME_META,  // inside a "<?" construct; content is skipped up to '>'
22
+  IN_TAG,  // inside an opening tag after its name: expects an attribute key, '/' or '>'
23
+  IN_TAG_CLOSE,  // after a closing tag's name: only whitespace allowed until '>'
24
+  IN_TAG_NAME_CLOSE,  // reading the name of a closing tag (after "</")
25
+  IN_TAG_TENTATIVE,  // just consumed '<'; the next character selects the tag kind
26
+  IN_ATTRKEY,  // reading an attribute key
27
+  AFTER_ATTRKEY,  // key ended by whitespace; '=' is expected next
28
+  AW_IN_ATTRVAL,  // after '=': awaiting the opening quote of the value
29
+  IN_ATTRVAL_SQ,  // inside a single-quoted attribute value
30
+  IN_ATTRVAL_DQ,  // inside a double-quoted attribute value
31
+  IN_TEXT,  // inside text content, up to the next '<'
32
+  IN_COMMENT_TENTATIVE,  // consumed "<!"; first '-' of the comment opener expected
33
+  IN_COMMENT_TENTATIVE2,  // consumed "<!-"; second '-' expected
34
+  IN_COMMENT,  // inside a comment, searching for '-'
35
+  IN_COMMENT_CL_TENTATIVE,  // one '-' seen inside a comment
36
+  IN_COMMENT_CL_TENTATIVE2,  // "--" seen inside a comment; '>' ends the comment
37
+  AW_CLOSING,  // saw '/' inside a tag; awaiting '>'
38
+  WS_SKIP  // tag finished; skip whitespace, then hand the tag to the caller
39
+};
40
+
41
+// Named character entity references (without '&' and ';') and their replacement strings; see
42
+// http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
43
+static const std::map<std::string, const char*> ENTITIES = {
44
+    {"aacute", "á"},
45
+    {"Aacute", "Á"},
46
+    {"acirc", "â"},
47
+    {"Acirc", "Â"},
48
+    {"acute", "´"},
49
+    {"aelig", "æ"},
50
+    {"AElig", "Æ"},
51
+    {"agrave", "à"},
52
+    {"Agrave", "À"},
53
+    {"alefsym", "ℵ"},
54
+    {"alpha", "α"},
55
+    {"Alpha", "Α"},
56
+    {"amp", "&"},
57
+    {"and", "∧"},
58
+    {"ang", "∠"},
59
+    {"apos", "'"},
60
+    {"aring", "å"},
61
+    {"Aring", "Å"},
62
+    {"asymp", "≈"},
63
+    {"atilde", "ã"},
64
+    {"Atilde", "Ã"},
65
+    {"auml", "ä"},
66
+    {"Auml", "Ä"},
67
+    {"bdquo", "„"},
68
+    {"beta", "β"},
69
+    {"Beta", "Β"},
70
+    {"brvbar", "¦"},
71
+    {"bull", "•"},
72
+    {"cap", "∩"},
73
+    {"ccedil", "ç"},
74
+    {"Ccedil", "Ç"},
75
+    {"cedil", "¸"},
76
+    {"cent", "¢"},
77
+    {"chi", "χ"},
78
+    {"Chi", "Χ"},
79
+    {"circ", "ˆ"},
80
+    {"clubs", "♣"},
81
+    {"cong", "≅"},
82
+    {"copy", "©"},
83
+    {"crarr", "↵"},
84
+    {"cup", "∪"},
85
+    {"curren", "¤"},
86
+    {"dagger", "†"},
87
+    {"Dagger", "‡"},
88
+    {"darr", "↓"},
89
+    {"dArr", "⇓"},
90
+    {"deg", "°"},
91
+    {"delta", "δ"},
92
+    {"Delta", "Δ"},
93
+    {"diams", "♦"},
94
+    {"divide", "÷"},
95
+    {"eacute", "é"},
96
+    {"Eacute", "É"},
97
+    {"ecirc", "ê"},
98
+    {"Ecirc", "Ê"},
99
+    {"egrave", "è"},
100
+    {"Egrave", "È"},
101
+    {"empty", "∅"},
102
+    {"emsp", "\xE2\x80\x83"},
103
+    {"ensp", "\xE2\x80\x82"},
104
+    {"epsilon", "ε"},
105
+    {"Epsilon", "Ε"},
106
+    {"equiv", "≡"},
107
+    {"eta", "η"},
108
+    {"Eta", "Η"},
109
+    {"eth", "ð"},
110
+    {"ETH", "Ð"},
111
+    {"euml", "ë"},
112
+    {"Euml", "Ë"},
113
+    {"euro", "€"},
114
+    {"exist", "∃"},
115
+    {"fnof", "ƒ"},
116
+    {"forall", "∀"},
117
+    {"frac12", "½"},
118
+    {"frac14", "¼"},
119
+    {"frac34", "¾"},
120
+    {"frasl", "⁄"},
121
+    {"gamma", "γ"},
122
+    {"Gamma", "Γ"},
123
+    {"ge", "≥"},
124
+    {"gt", ">"},
125
+    {"harr", "↔"},
126
+    {"hArr", "⇔"},
127
+    {"hearts", "♥"},
128
+    {"hellip", "…"},
129
+    {"iacute", "í"},
130
+    {"Iacute", "Í"},
131
+    {"icirc", "î"},
132
+    {"Icirc", "Î"},
133
+    {"iexcl", "¡"},
134
+    {"igrave", "ì"},
135
+    {"Igrave", "Ì"},
136
+    {"image", "ℑ"},
137
+    {"infin", "∞"},
138
+    {"int", "∫"},
139
+    {"iota", "ι"},
140
+    {"Iota", "Ι"},
141
+    {"iquest", "¿"},
142
+    {"isin", "∈"},
143
+    {"iuml", "ï"},
144
+    {"Iuml", "Ï"},
145
+    {"kappa", "κ"},
146
+    {"Kappa", "Κ"},
147
+    {"lambda", "λ"},
148
+    {"Lambda", "Λ"},
149
+    {"lang", "〈"},
150
+    {"laquo", "«"},
151
+    {"larr", "←"},
152
+    {"lArr", "⇐"},
153
+    {"lceil", "⌈"},
154
+    {"ldquo", "“"},
155
+    {"le", "≤"},
156
+    {"lfloor", "⌊"},
157
+    {"lowast", "∗"},
158
+    {"loz", "◊"},
159
+    {"lrm", "\xE2\x80\x8E"},
160
+    {"lsaquo", "‹"},
161
+    {"lsquo", "‘"},
162
+    {"lt", "<"},
163
+    {"macr", "¯"},
164
+    {"mdash", "—"},
165
+    {"micro", "µ"},
166
+    {"middot", "·"},
167
+    {"minus", "−"},
168
+    {"mu", "μ"},
169
+    {"Mu", "Μ"},
170
+    {"nabla", "∇"},
171
+    {"nbsp", "\xC2\xA0"},
172
+    {"ndash", "–"},
173
+    {"ne", "≠"},
174
+    {"ni", "∋"},
175
+    {"not", "¬"},
176
+    {"notin", "∉"},
177
+    {"nsub", "⊄"},
178
+    {"ntilde", "ñ"},
179
+    {"Ntilde", "Ñ"},
180
+    {"nu", "ν"},
181
+    {"Nu", "Ν"},
182
+    {"oacute", "ó"},
183
+    {"Oacute", "Ó"},
184
+    {"ocirc", "ô"},
185
+    {"Ocirc", "Ô"},
186
+    {"oelig", "œ"},
187
+    {"OElig", "Œ"},
188
+    {"ograve", "ò"},
189
+    {"Ograve", "Ò"},
190
+    {"oline", "‾"},
191
+    {"omega", "ω"},
192
+    {"Omega", "Ω"},
193
+    {"omicron", "ο"},
194
+    {"Omicron", "Ο"},
195
+    {"oplus", "⊕"},
196
+    {"or", "∨"},
197
+    {"ordf", "ª"},
198
+    {"ordm", "º"},
199
+    {"oslash", "ø"},
200
+    {"Oslash", "Ø"},
201
+    {"otilde", "õ"},
202
+    {"Otilde", "Õ"},
203
+    {"otimes", "⊗"},
204
+    {"ouml", "ö"},
205
+    {"Ouml", "Ö"},
206
+    {"para", "¶"},
207
+    {"part", "∂"},
208
+    {"permil", "‰"},
209
+    {"perp", "⊥"},
210
+    {"phi", "φ"},
211
+    {"Phi", "Φ"},
212
+    {"piv", "ϖ"},
213
+    {"pi", "π"},
214
+    {"Pi", "Π"},
215
+    {"plusmn", "±"},
216
+    {"pound", "£"},
217
+    {"prime", "′"},
218
+    {"Prime", "″"},
219
+    {"prod", "∏"},
220
+    {"prop", "∝"},
221
+    {"psi", "ψ"},
222
+    {"Psi", "Ψ"},
223
+    {"quot", "\""},
224
+    {"radic", "√"},
225
+    {"rang", "〉"},
226
+    {"raquo", "»"},
227
+    {"rarr", "→"},
228
+    {"rArr", "⇒"},
229
+    {"rceil", "⌉"},
230
+    {"rdquo", "”"},
231
+    {"real", "ℜ"},
232
+    {"reg", "®"},
233
+    {"rfloor", "⌋"},
234
+    {"rho", "ρ"},
235
+    {"Rho", "Ρ"},
236
+    {"rlm", "\xE2\x80\x8F"},
237
+    {"rsaquo", "›"},
238
+    {"rsquo", "’"},
239
+    {"sbquo", "‚"},
240
+    {"scaron", "š"},
241
+    {"Scaron", "Š"},
242
+    {"sdot", "⋅"},
243
+    {"sect", "§"},
244
+    {"shy", "\xC2\xAD"},
245
+    {"sigmaf", "ς"},
246
+    {"sigma", "σ"},
247
+    {"Sigma", "Σ"},
248
+    {"sim", "∼"},
249
+    {"spades", "♠"},
250
+    {"sub", "⊂"},
251
+    {"sube", "⊆"},
252
+    {"sum", "∑"},
253
+    {"sup", "⊃"},
254
+    {"sup1", "¹"},
255
+    {"sup2", "²"},
256
+    {"sup3", "³"},
257
+    {"supe", "⊇"},
258
+    {"szlig", "ß"},
259
+    {"tau", "τ"},
260
+    {"Tau", "Τ"},
261
+    {"there4", "∴"},
262
+    {"thetasym", "ϑ"},
263
+    {"theta", "θ"},
264
+    {"Theta", "Θ"},
265
+    {"thinsp", "\xE2\x80\x89"},
266
+    {"thorn", "þ"},
267
+    {"THORN", "Þ"},
268
+    {"tilde", "˜"},
269
+    {"times", "×"},
270
+    {"trade", "™"},
271
+    {"uacute", "ú"},
272
+    {"Uacute", "Ú"},
273
+    {"uarr", "↑"},
274
+    {"uArr", "⇑"},
275
+    {"ucirc", "û"},
276
+    {"Ucirc", "Û"},
277
+    {"ugrave", "ù"},
278
+    {"Ugrave", "Ù"},
279
+    {"uml", "¨"},
280
+    {"upsih", "ϒ"},
281
+    {"upsilon", "υ"},
282
+    {"Upsilon", "Υ"},
283
+    {"uuml", "ü"},
284
+    {"Uuml", "Ü"},
285
+    {"weierp", "℘"},
286
+    {"xi", "ξ"},
287
+    {"Xi", "Ξ"},
288
+    {"yacute", "ý"},
289
+    {"Yacute", "Ý"},
290
+    {"yen", "¥"},
291
+    {"yuml", "ÿ"},
292
+    {"Yuml", "Ÿ"},
293
+    {"zeta", "ζ"},
294
+    {"Zeta", "Ζ"},
295
+    {"zwj", "\xE2\x80\x8D"},
296
+    {"zwnj", "\xE2\x80\x8C"}};
297
+
298
// Exception raised for I/O and syntax errors encountered while parsing.
//
// The message has the form "<file> at position <pos>: <msg>", where <pos> is
// offset + (p - buff), i.e. the distance of the read pointer p from the start
// of the buffer buff, shifted by the given base offset.
class parse_exc : public std::exception {
 public:
  // msg:    human readable description of the problem
  // file:   path of the file being parsed
  // p:      current read position inside buff (may be 0 if buff is 0)
  // buff:   start of the buffer p points into (may be 0 if p is 0)
  // offset: base offset added to the in-buffer position
  parse_exc(const std::string& msg, const std::string& file, const char* p,
            const char* buff, size_t offset) {
    std::stringstream ss;
    ss << file << " at position " << (offset + (p - buff)) << ": " << msg;
    _msg = ss.str();
  }
  ~parse_exc() noexcept {}

  // Returns the formatted message; the pointer is valid as long as this
  // exception object lives.
  const char* what() const noexcept override { return _msg.c_str(); }

 private:
  std::string _msg;
};
313
+
314
// Strict weak ordering on C strings by content (not by pointer value), so
// that NUL-terminated strings can be used as ordered-container keys.
struct attr_cmp {
  bool operator()(const char* const& a, const char* const& b) const {
    const int order = std::strcmp(a, b);
    return order < 0;
  }
};
319
+
320
+struct parser_state {
321
+  parser_state() : s(NONE), hanging(0), off(0) {}
322
+  std::stack<std::string> tagStack;
323
+  state s;
324
+  size_t hanging;
325
+  int64_t off;
326
+};
327
+
328
+typedef std::map<const char*, const char*, attr_cmp> AttrMap;
329
+
330
+struct tag {
331
+  const char* name;
332
+  const char* text;
333
+  AttrMap attrs;
334
+};
335
+
336
+class file {
337
+ public:
338
+  file(const std::string& path);
339
+  ~file();
340
+
341
+  const tag& get() const;
342
+
343
+  bool next();
344
+  size_t level() const;
345
+  void reset();
346
+  parser_state state();
347
+  void set_state(const parser_state& s);
348
+  static std::string decode(const char* str);
349
+  static std::string decode(const std::string& str);
350
+
351
+ private:
352
+  int _file;
353
+  parser_state _s;
354
+  parser_state _prevs;
355
+  char** _buffer;
356
+  char* _c;
357
+  int64_t _lastBytes;
358
+
359
+  const char* _tmp;
360
+  const char* _tmp2;
361
+
362
+  size_t _which;
363
+  std::string _path;
364
+
365
+  int64_t _totReadBef;
366
+  int64_t _lastNewData;
367
+
368
+  tag _ret;
369
+
370
+  static size_t utf8(size_t cp, char* out);
371
+  const char* emptyStr = "";
372
+};
373
+
374
+// _____________________________________________________________________________
375
+inline file::file(const std::string& path)
376
+    : _file(0), _c(0), _lastBytes(0), _which(0), _path(path), _totReadBef(0) {
377
+  _buffer = new char*[2];
378
+  _buffer[0] = new char[BUFFER_S + 1];
379
+  _buffer[1] = new char[BUFFER_S + 1];
380
+
381
+  reset();
382
+}
383
+
384
+// _____________________________________________________________________________
385
+inline file::~file() {
386
+  delete[] _buffer[0];
387
+  delete[] _buffer[1];
388
+  delete[] _buffer;
389
+  close(_file);
390
+}
391
+
392
+// _____________________________________________________________________________
393
+inline void file::reset() {
394
+  _which = 0;
395
+  _s.s = NONE;
396
+  _s.hanging = 0;
397
+  _totReadBef = 0;
398
+
399
+  if (_file) close(_file);
400
+  _file = open(_path.c_str(), O_RDONLY);
401
+  if (_file < 0)
402
+    throw parse_exc(std::string("could not open file"), _path, 0, 0, 0);
403
+#ifdef __unix__
404
+  posix_fadvise(_file, 0, 0, POSIX_FADV_SEQUENTIAL);
405
+#endif
406
+
407
+  _lastBytes = read(_file, _buffer[_which], BUFFER_S);
408
+  _lastNewData = _lastBytes;
409
+  _c = _buffer[_which];
410
+  while (!_s.tagStack.empty()) _s.tagStack.pop();
411
+  _s.tagStack.push("[root]");
412
+  _prevs = _s;
413
+}
414
+
415
+// _____________________________________________________________________________
416
+inline size_t file::level() const { return _s.tagStack.size() - _s.hanging; }
417
+
418
+// _____________________________________________________________________________
419
+inline parser_state file::state() { return _prevs; }
420
+
421
+// _____________________________________________________________________________
422
+inline void file::set_state(const parser_state& s) {
423
+  _s = s;
424
+  _prevs = s;
425
+
426
+  lseek(_file, _s.off, SEEK_SET);
427
+  _totReadBef = _s.off;
428
+  _lastBytes = read(_file, _buffer[_which], BUFFER_S);
429
+  _lastNewData = _lastBytes;
430
+  _c = _buffer[_which];
431
+
432
+  next();
433
+}
434
+
435
+// _____________________________________________________________________________
436
+inline const tag& file::get() const { return _ret; }
437
+
438
+// _____________________________________________________________________________
439
+inline bool file::next() {
440
+  if (!_s.tagStack.size()) return false;
441
+  // avoid too much stack copying
442
+  if (_prevs.tagStack.size() != _s.tagStack.size() ||
443
+      _prevs.tagStack.top() != _s.tagStack.top()) {
444
+    _prevs.tagStack = _s.tagStack;
445
+  }
446
+  _prevs.s = _s.s;
447
+  _prevs.hanging = _s.hanging;
448
+  _prevs.off =
449
+      _totReadBef + (_c - _buffer[_which]) - (_lastBytes - _lastNewData);
450
+
451
+  if (_s.hanging) _s.hanging--;
452
+  _ret.name = 0;
453
+  _ret.text = emptyStr;
454
+  _ret.attrs.clear();
455
+  void* i;
456
+  while (_lastBytes) {
457
+    for (; _c - _buffer[_which] < _lastBytes; ++_c) {
458
+      char c = *_c;
459
+      switch (_s.s) {
460
+        case NONE:
461
+          if (std::isspace(c))
462
+            continue;
463
+          else if (c == '<') {
464
+            _s.s = IN_TAG_TENTATIVE;
465
+            continue;
466
+          }
467
+          _s.s = IN_TEXT;
468
+          _ret.name = emptyStr;
469
+          _tmp = _c;
470
+          continue;
471
+
472
+        case IN_TEXT:
473
+          i = memchr(_c, '<', _lastBytes - (_c - _buffer[_which]));
474
+          if (!i) {
475
+            _c = _buffer[_which] + _lastBytes;
476
+            continue;
477
+          }
478
+          _c = (char*)i;
479
+          *_c = 0;
480
+          _ret.text = _tmp;
481
+          _s.s = IN_TAG_TENTATIVE;
482
+          _c++;
483
+          return true;
484
+
485
+        case IN_COMMENT_TENTATIVE:
486
+          if (c == '-') {
487
+            _s.s = IN_COMMENT_TENTATIVE2;
488
+            continue;
489
+          }
490
+          throw parse_exc("Expected comment", _path, _c, _buffer[_which],
491
+                          _prevs.off);
492
+
493
+        case IN_COMMENT_TENTATIVE2:
494
+          if (c == '-') {
495
+            _s.s = IN_COMMENT;
496
+            continue;
497
+          }
498
+          throw parse_exc("Expected comment", _path, _c, _buffer[_which],
499
+                          _prevs.off);
500
+
501
+        case IN_COMMENT_CL_TENTATIVE:
502
+          if (c == '-') {
503
+            _s.s = IN_COMMENT_CL_TENTATIVE2;
504
+            continue;
505
+          }
506
+          _s.s = IN_COMMENT;
507
+          continue;
508
+
509
+        case IN_COMMENT_CL_TENTATIVE2:
510
+          if (c == '>') {
511
+            _s.s = NONE;
512
+            continue;
513
+          }
514
+          _s.s = IN_COMMENT;
515
+        // fall through, we are still in comment
516
+
517
+        case IN_COMMENT:
518
+          i = memchr(_c, '-', _lastBytes - (_c - _buffer[_which]));
519
+          if (!i) {
520
+            _c = _buffer[_which] + _lastBytes;
521
+            continue;
522
+          }
523
+          _c = (char*)i;
524
+          _s.s = IN_COMMENT_CL_TENTATIVE;
525
+          continue;
526
+
527
+        case IN_TAG_TENTATIVE:
528
+          if (c == '/') {
529
+            _s.s = IN_TAG_NAME_CLOSE;
530
+            _tmp = _c + 1;
531
+            continue;
532
+          } else if (c == '?') {
533
+            _s.s = IN_TAG_NAME_META;
534
+            continue;
535
+          } else if (c == '!') {
536
+            _s.s = IN_COMMENT_TENTATIVE;
537
+            continue;
538
+          } else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
539
+            _s.s = IN_TAG_NAME;
540
+            _ret.name = _c;
541
+            continue;
542
+          }
543
+
544
+        case IN_TAG:
545
+          if (std::isspace(c))
546
+            continue;
547
+          else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
548
+            _s.s = IN_ATTRKEY;
549
+            _tmp = _c;
550
+            continue;
551
+          } else if (c == '/') {
552
+            _s.s = AW_CLOSING;
553
+            continue;
554
+          } else if (c == '>') {
555
+            _s.hanging++;
556
+            _s.tagStack.push(_ret.name);
557
+            _s.s = WS_SKIP;
558
+            continue;
559
+          }
560
+          throw parse_exc("Expected valid tag", _path, _c, _buffer[_which],
561
+                          _prevs.off);
562
+
563
+        case IN_ATTRVAL_SQ:
564
+          i = memchr(_c, '\'', _lastBytes - (_c - _buffer[_which]));
565
+          if (!i) {
566
+            _c = _buffer[_which] + _lastBytes;
567
+            continue;
568
+          }
569
+          _c = (char*)i;
570
+          _s.s = IN_TAG;
571
+          *_c = 0;
572
+          _ret.attrs[_tmp] = _tmp2;
573
+          continue;
574
+
575
+        case IN_ATTRVAL_DQ:
576
+          i = memchr(_c, '"', _lastBytes - (_c - _buffer[_which]));
577
+          if (!i) {
578
+            _c = _buffer[_which] + _lastBytes;
579
+            continue;
580
+          }
581
+          _c = (char*)i;
582
+          _s.s = IN_TAG;
583
+          *_c = 0;
584
+          _ret.attrs[_tmp] = _tmp2;
585
+          continue;
586
+
587
+        case AW_IN_ATTRVAL:
588
+          if (std::isspace(c))
589
+            continue;
590
+          else if (c == '\'') {
591
+            _s.s = IN_ATTRVAL_SQ;
592
+            _tmp2 = _c + 1;
593
+            continue;
594
+          } else if (c == '"') {
595
+            _s.s = IN_ATTRVAL_DQ;
596
+            _tmp2 = _c + 1;
597
+            continue;
598
+          }
599
+          throw parse_exc("Expected attribute value", _path, _c,
600
+                          _buffer[_which], _prevs.off);
601
+
602
+        case IN_ATTRKEY:
603
+          if (std::isspace(c)) {
604
+            *_c = 0;
605
+            _s.s = AFTER_ATTRKEY;
606
+            continue;
607
+          } else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
608
+            continue;
609
+          } else if (c == '=') {
610
+            *_c = 0;
611
+            _s.s = AW_IN_ATTRVAL;
612
+            continue;
613
+          }
614
+
615
+          throw parse_exc("Expected attribute key char or =", _path, _c,
616
+                          _buffer[_which], _prevs.off);
617
+
618
+        case AFTER_ATTRKEY:
619
+          if (std::isspace(c))
620
+            continue;
621
+          else if (c == '=') {
622
+            _s.s = AW_IN_ATTRVAL;
623
+            continue;
624
+          }
625
+          throw parse_exc(
626
+              std::string("Expected attribute value for '") + _tmp + "'.",
627
+              _path, _c, _buffer[_which], _prevs.off);
628
+
629
+        case IN_TAG_NAME:
630
+          if (std::isspace(c)) {
631
+            *_c = 0;
632
+            _s.s = IN_TAG;
633
+            continue;
634
+          } else if (c == '>') {
635
+            *_c = 0;
636
+            _s.hanging++;
637
+            _s.tagStack.push(_ret.name);
638
+            _s.s = WS_SKIP;
639
+            continue;
640
+          } else if (c == '/') {
641
+            *_c = 0;
642
+            _s.s = AW_CLOSING;
643
+            continue;
644
+          } else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
645
+            continue;
646
+          }
647
+
648
+        case IN_TAG_NAME_META:
649
+          // TODO: read meta tags!
650
+          if (c == '>') {
651
+            _s.s = NONE;
652
+            continue;
653
+          }
654
+
655
+          continue;
656
+
657
+        case IN_TAG_NAME_CLOSE:
658
+          if (std::isspace(c)) {
659
+            *_c = 0;
660
+            _s.s = IN_TAG_CLOSE;
661
+            continue;
662
+          } else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
663
+            continue;
664
+          } else if (c == '>') {
665
+            *_c = 0;
666
+            if (_tmp != _s.tagStack.top()) {
667
+              throw parse_exc(std::string("Closing wrong tag '<") + _tmp +
668
+                                  ">', expected close of '<" +
669
+                                  _s.tagStack.top() + ">'.",
670
+                              _path, _c, _buffer[_which], _prevs.off);
671
+            }
672
+            _s.tagStack.pop();
673
+            _s.s = NONE;
674
+            continue;
675
+          }
676
+
677
+        case IN_TAG_CLOSE:
678
+          if (std::isspace(c))
679
+            continue;
680
+          else if (c == '>') {
681
+            if (_tmp != _s.tagStack.top()) {
682
+              throw parse_exc(std::string("Closing wrong tag '<") + _tmp +
683
+                                  ">', expected close of '<" +
684
+                                  _s.tagStack.top() + ">'.",
685
+                              _path, _c, _buffer[_which], _prevs.off);
686
+            }
687
+            _s.tagStack.pop();
688
+            _s.s = NONE;
689
+            continue;
690
+          }
691
+          throw parse_exc("Expected '>'", _path, _c, _buffer[_which],
692
+                          _prevs.off);
693
+
694
+        case AW_CLOSING:
695
+          if (c == '>') {
696
+            _s.s = WS_SKIP;
697
+            continue;
698
+          }
699
+
700
+        case WS_SKIP:
701
+          if (std::isspace(c)) continue;
702
+          _s.s = NONE;
703
+          return true;
704
+      }
705
+    }
706
+
707
+    // buffer ended, read new stuff, but copy remaining if needed
708
+    size_t off = 0;
709
+    if (_s.s == IN_TAG_NAME) {  //|| IN_TAG_NAME_META) {
710
+      off = _lastBytes - (_ret.name - _buffer[_which]);
711
+      memmove(_buffer[!_which], _ret.name, off);
712
+      _ret.name = _buffer[!_which];
713
+    } else if (_s.s == IN_TAG_NAME_CLOSE || _s.s == IN_ATTRKEY ||
714
+               _s.s == IN_TEXT) {
715
+      off = _lastBytes - (_tmp - _buffer[_which]);
716
+      memmove(_buffer[!_which], _tmp, off);
717
+      _tmp = _buffer[!_which];
718
+    } else if (_s.s == IN_ATTRVAL_SQ || _s.s == IN_ATTRVAL_DQ) {
719
+      off = _lastBytes - (_tmp2 - _buffer[_which]);
720
+      memmove(_buffer[!_which], _tmp2, off);
721
+      _tmp2 = _buffer[!_which];
722
+    }
723
+
724
+    assert(off <= BUFFER_S);
725
+
726
+    size_t readb = read(_file, _buffer[!_which] + off, BUFFER_S - off);
727
+    if (!readb) break;
728
+    _totReadBef += _lastNewData;
729
+    _which = !_which;
730
+    _lastNewData = readb;
731
+    _lastBytes = _lastNewData + off;
732
+    _c = _buffer[_which] + off;
733
+  }
734
+
735
+  if (_s.tagStack.size()) {
736
+    if (_s.tagStack.top() != "[root]") {
737
+      throw parse_exc("XML tree not complete", _path, _c, _buffer[_which],
738
+                      _prevs.off);
739
+    }
740
+    _s.tagStack.pop();
741
+  }
742
+  _s.s = NONE;
743
+  _ret.name = "[root]";
744
+  return false;
745
+}
746
+
747
+// _____________________________________________________________________________
748
+inline std::string file::decode(const std::string& str) {
749
+  return decode(str.c_str());
750
+}
751
+
752
+// _____________________________________________________________________________
753
+inline std::string file::decode(const char* str) {
754
+  const char* c = strchr(str, '&');
755
+  if (!c) return str;
756
+
757
+  char* decRet = new char[strlen(str) + 1];
758
+  const char* last = str;
759
+  char* dstPt = decRet;
760
+
761
+  for (; c != 0; c = strchr(c + 1, '&')) {
762
+    memcpy(dstPt, last, c - last);
763
+    dstPt += c - last;
764
+    last = c;
765
+
766
+    if (*(c + 1) == '#') {
767
+      uint64_t cp = -1;
768
+      char* tail;
769
+      errno = 0;
770
+      if (*(c + 2) == 'x' || *(c + 2) == 'X')
771
+        cp = strtoul(c + 3, &tail, 16);
772
+      else
773
+        cp = strtoul(c + 2, &tail, 10);
774
+
775
+      if (*tail == ';' && cp <= 0x1FFFFF && !errno) {
776
+        dstPt += utf8(cp, dstPt);
777
+        last = tail + 1;
778
+      }
779
+    } else {
780
+      const char* e = strchr(c, ';');
781
+      if (e) {
782
+        char* ent = new char[e - 1 - c + 1];
783
+        memcpy(ent, c + 1, e - 1 - c);
784
+        ent[e - 1 - c] = 0;
785
+        const auto it = ENTITIES.find(ent);
786
+        if (it != ENTITIES.end()) {
787
+          const char* utf8 = it->second;
788
+          memcpy(dstPt, utf8, strlen(utf8));
789
+          dstPt += strlen(utf8);
790
+          last += strlen(ent) + 2;
791
+        }
792
+        delete[] ent;
793
+      }
794
+    }
795
+  }
796
+
797
+  strcpy(dstPt, last);
798
+  std::string ret(decRet);
799
+  delete[] decRet;
800
+  return ret;
801
+}
802
+
803
+// _____________________________________________________________________________
804
+inline size_t file::utf8(size_t cp, char* out) {
805
+  if (cp <= 0x7F) {
806
+    out[0] = cp & 0x7F;
807
+    return 1;
808
+  } else if (cp <= 0x7FF) {
809
+    out[0] = 0xC0 | (cp >> 6);
810
+    out[1] = 0x80 | (cp & 0x3F);
811
+    return 2;
812
+  } else if (cp <= 0xFFFF) {
813
+    out[0] = 0xE0 | (cp >> 12);
814
+    out[1] = 0x80 | ((cp >> 6) & 0x3F);
815
+    out[2] = 0x80 | (cp & 0x3F);
816
+    return 3;
817
+  } else if (cp <= 0x1FFFFF) {
818
+    out[0] = 0xF0 | (cp >> 18);
819
+    out[1] = 0x80 | ((cp >> 12) & 0x3F);
820
+    out[2] = 0x80 | ((cp >> 6) & 0x3F);
821
+    out[3] = 0x80 | (cp & 0x3F);
822
+    return 4;
823
+  }
824
+
825
+  return 0;
826
+}
827
+}
828
+
829
+#endif  // PFXML_H_