|
@@ -0,0 +1,829 @@
|
|
1
|
+// Copyright 2017 Patrick Brosi
|
|
2
|
+// info@patrickbrosi.de
|
|
3
|
+
|
|
4
|
+#ifndef PFXML_H_
|
|
5
|
+#define PFXML_H_
|
|
6
|
+
|
|
7
|
#include <fcntl.h>
#include <unistd.h>

#include <cassert>
#include <cctype>
#include <cerrno>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <exception>
#include <fstream>
#include <map>
#include <sstream>
#include <stack>
#include <string>
|
|
13
|
+
|
|
14
|
+namespace pfxml {
|
|
15
|
+
|
|
16
|
// Number of bytes requested per read() call (16 KiB). Each of the parser's two
// buffers is allocated with BUFFER_S + 1 bytes.
static const size_t BUFFER_S = 16 * 1024;
|
|
17
|
+
|
|
18
|
// States of the character-level state machine driven by file::next().
enum state {
  NONE,                      // outside of any tag; whitespace is skipped
  IN_TAG_NAME,               // reading the name of an opening tag
  IN_TAG_NAME_META,          // inside a "<?" construct; skipped up to '>'
  IN_TAG,                    // inside an opening tag, after its name
  IN_TAG_CLOSE,              // after the name of a closing tag, awaiting '>'
  IN_TAG_NAME_CLOSE,         // reading the name of a closing tag ("</name")
  IN_TAG_TENTATIVE,          // just read '<'; kind of tag not yet known
  IN_ATTRKEY,                // reading an attribute name
  AFTER_ATTRKEY,             // attribute name ended by whitespace, awaiting '='
  AW_IN_ATTRVAL,             // read '=', awaiting the opening quote
  IN_ATTRVAL_SQ,             // inside a single-quoted attribute value
  IN_ATTRVAL_DQ,             // inside a double-quoted attribute value
  IN_TEXT,                   // inside character data, scanning for '<'
  IN_COMMENT_TENTATIVE,      // read "<!", expecting the first '-'
  IN_COMMENT_TENTATIVE2,     // read "<!-", expecting the second '-'
  IN_COMMENT,                // inside a comment, scanning for '-'
  IN_COMMENT_CL_TENTATIVE,   // read one '-' inside a comment
  IN_COMMENT_CL_TENTATIVE2,  // read "--" inside a comment; '>' closes it
  AW_CLOSING,                // read '/' inside an opening tag, awaiting '>'
  WS_SKIP                    // tag complete; skipping whitespace before it is
                             // handed to the caller
};
|
|
40
|
+
|
|
41
|
// Named character entities understood by file::decode(): maps the entity name
// (the part between '&' and ';') to its replacement text. Most replacements
// are written as literal characters, so their byte encoding is that of this
// source file; invisible characters are spelled out as UTF-8 byte escapes.
// See
// http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
static const std::map<std::string, const char*> ENTITIES = {
    {"aacute", "á"},
    {"Aacute", "Á"},
    {"acirc", "â"},
    {"Acirc", "Â"},
    {"acute", "´"},
    {"aelig", "æ"},
    {"AElig", "Æ"},
    {"agrave", "à"},
    {"Agrave", "À"},
    {"alefsym", "ℵ"},
    {"alpha", "α"},
    {"Alpha", "Α"},
    {"amp", "&"},
    {"and", "∧"},
    {"ang", "∠"},
    {"apos", "'"},
    {"aring", "å"},
    {"Aring", "Å"},
    {"asymp", "≈"},
    {"atilde", "ã"},
    {"Atilde", "Ã"},
    {"auml", "ä"},
    {"Auml", "Ä"},
    {"bdquo", "„"},
    {"beta", "β"},
    {"Beta", "Β"},
    {"brvbar", "¦"},
    {"bull", "•"},
    {"cap", "∩"},
    {"ccedil", "ç"},
    {"Ccedil", "Ç"},
    {"cedil", "¸"},
    {"cent", "¢"},
    {"chi", "χ"},
    {"Chi", "Χ"},
    {"circ", "ˆ"},
    {"clubs", "♣"},
    {"cong", "≅"},
    {"copy", "©"},
    {"crarr", "↵"},
    {"cup", "∪"},
    {"curren", "¤"},
    {"dagger", "†"},
    {"Dagger", "‡"},
    {"darr", "↓"},
    {"dArr", "⇓"},
    {"deg", "°"},
    {"delta", "δ"},
    {"Delta", "Δ"},
    {"diams", "♦"},
    {"divide", "÷"},
    {"eacute", "é"},
    {"Eacute", "É"},
    {"ecirc", "ê"},
    {"Ecirc", "Ê"},
    {"egrave", "è"},
    {"Egrave", "È"},
    {"empty", "∅"},
    {"emsp", "\xE2\x80\x83"},
    {"ensp", "\xE2\x80\x82"},
    {"epsilon", "ε"},
    {"Epsilon", "Ε"},
    {"equiv", "≡"},
    {"eta", "η"},
    {"Eta", "Η"},
    {"eth", "ð"},
    {"ETH", "Ð"},
    {"euml", "ë"},
    {"Euml", "Ë"},
    {"euro", "€"},
    {"exist", "∃"},
    {"fnof", "ƒ"},
    {"forall", "∀"},
    {"frac12", "½"},
    {"frac14", "¼"},
    {"frac34", "¾"},
    {"frasl", "⁄"},
    {"gamma", "γ"},
    {"Gamma", "Γ"},
    {"ge", "≥"},
    {"gt", ">"},
    {"harr", "↔"},
    {"hArr", "⇔"},
    {"hearts", "♥"},
    {"hellip", "…"},
    {"iacute", "í"},
    {"Iacute", "Í"},
    {"icirc", "î"},
    {"Icirc", "Î"},
    {"iexcl", "¡"},
    {"igrave", "ì"},
    {"Igrave", "Ì"},
    {"image", "ℑ"},
    {"infin", "∞"},
    {"int", "∫"},
    {"iota", "ι"},
    {"Iota", "Ι"},
    {"iquest", "¿"},
    {"isin", "∈"},
    {"iuml", "ï"},
    {"Iuml", "Ï"},
    {"kappa", "κ"},
    {"Kappa", "Κ"},
    {"lambda", "λ"},
    {"Lambda", "Λ"},
    {"lang", "〈"},
    {"laquo", "«"},
    {"larr", "←"},
    {"lArr", "⇐"},
    {"lceil", "⌈"},
    {"ldquo", "“"},
    {"le", "≤"},
    {"lfloor", "⌊"},
    {"lowast", "∗"},
    {"loz", "◊"},
    {"lrm", "\xE2\x80\x8E"},
    {"lsaquo", "‹"},
    {"lsquo", "‘"},
    {"lt", "<"},
    {"macr", "¯"},
    {"mdash", "—"},
    {"micro", "µ"},
    {"middot", "·"},
    {"minus", "−"},
    {"mu", "μ"},
    {"Mu", "Μ"},
    {"nabla", "∇"},
    {"nbsp", "\xC2\xA0"},
    {"ndash", "–"},
    {"ne", "≠"},
    {"ni", "∋"},
    {"not", "¬"},
    {"notin", "∉"},
    {"nsub", "⊄"},
    {"ntilde", "ñ"},
    {"Ntilde", "Ñ"},
    {"nu", "ν"},
    {"Nu", "Ν"},
    {"oacute", "ó"},
    {"Oacute", "Ó"},
    {"ocirc", "ô"},
    {"Ocirc", "Ô"},
    {"oelig", "œ"},
    {"OElig", "Œ"},
    {"ograve", "ò"},
    {"Ograve", "Ò"},
    {"oline", "‾"},
    {"omega", "ω"},
    {"Omega", "Ω"},
    {"omicron", "ο"},
    {"Omicron", "Ο"},
    {"oplus", "⊕"},
    {"or", "∨"},
    {"ordf", "ª"},
    {"ordm", "º"},
    {"oslash", "ø"},
    {"Oslash", "Ø"},
    {"otilde", "õ"},
    {"Otilde", "Õ"},
    {"otimes", "⊗"},
    {"ouml", "ö"},
    {"Ouml", "Ö"},
    {"para", "¶"},
    {"part", "∂"},
    {"permil", "‰"},
    {"perp", "⊥"},
    {"phi", "φ"},
    {"Phi", "Φ"},
    {"piv", "ϖ"},
    {"pi", "π"},
    {"Pi", "Π"},
    {"plusmn", "±"},
    {"pound", "£"},
    {"prime", "′"},
    {"Prime", "″"},
    {"prod", "∏"},
    {"prop", "∝"},
    {"psi", "ψ"},
    {"Psi", "Ψ"},
    {"quot", "\""},
    {"radic", "√"},
    {"rang", "〉"},
    {"raquo", "»"},
    {"rarr", "→"},
    {"rArr", "⇒"},
    {"rceil", "⌉"},
    {"rdquo", "”"},
    {"real", "ℜ"},
    {"reg", "®"},
    {"rfloor", "⌋"},
    {"rho", "ρ"},
    {"Rho", "Ρ"},
    {"rlm", "\xE2\x80\x8F"},
    {"rsaquo", "›"},
    {"rsquo", "’"},
    {"sbquo", "‚"},
    {"scaron", "š"},
    {"Scaron", "Š"},
    {"sdot", "⋅"},
    {"sect", "§"},
    {"shy", "\xC2\xAD"},
    {"sigmaf", "ς"},
    {"sigma", "σ"},
    {"Sigma", "Σ"},
    {"sim", "∼"},
    {"spades", "♠"},
    {"sub", "⊂"},
    {"sube", "⊆"},
    {"sum", "∑"},
    {"sup", "⊃"},
    {"sup1", "¹"},
    {"sup2", "²"},
    {"sup3", "³"},
    {"supe", "⊇"},
    {"szlig", "ß"},
    {"tau", "τ"},
    {"Tau", "Τ"},
    {"there4", "∴"},
    {"thetasym", "ϑ"},
    {"theta", "θ"},
    {"Theta", "Θ"},
    {"thinsp", "\xE2\x80\x89"},
    {"thorn", "þ"},
    {"THORN", "Þ"},
    {"tilde", "˜"},
    {"times", "×"},
    {"trade", "™"},
    {"uacute", "ú"},
    {"Uacute", "Ú"},
    {"uarr", "↑"},
    {"uArr", "⇑"},
    {"ucirc", "û"},
    {"Ucirc", "Û"},
    {"ugrave", "ù"},
    {"Ugrave", "Ù"},
    {"uml", "¨"},
    {"upsih", "ϒ"},
    {"upsilon", "υ"},
    {"Upsilon", "Υ"},
    {"uuml", "ü"},
    {"Uuml", "Ü"},
    {"weierp", "℘"},
    {"xi", "ξ"},
    {"Xi", "Ξ"},
    {"yacute", "ý"},
    {"Yacute", "Ý"},
    {"yen", "¥"},
    {"yuml", "ÿ"},
    {"Yuml", "Ÿ"},
    {"zeta", "ζ"},
    {"Zeta", "Ζ"},
    {"zwj", "\xE2\x80\x8D"},
    {"zwnj", "\xE2\x80\x8C"}};
|
|
297
|
+
|
|
298
|
// Exception thrown by the parser for I/O and syntax errors.
//
// what() returns "<file> at position <offset + (p - buff)>: <msg>".
class parse_exc : public std::exception {
 public:
  // msg:    human-readable description of the problem
  // file:   path of the file being parsed
  // p:      pointer to the offending character inside buff
  // buff:   start of the buffer p points into
  // offset: value added to (p - buff) to form the reported position
  //
  // p and buff may both be null (as done for I/O errors); the reported
  // position is then just offset.
  parse_exc(std::string msg, std::string file, const char* p, char* buff,
            size_t offset) {
    std::stringstream ss;
    ss << file << " at position " << (offset + (p - buff)) << ": " << msg;
    _msg = ss.str();
  }

  // noexcept replaces the dynamic exception specification throw(), which is
  // deprecated since C++11 and no longer valid in C++20.
  ~parse_exc() noexcept override {}

  // Returns the formatted message; the pointer stays valid for the lifetime of
  // this exception object.
  const char* what() const noexcept override { return _msg.c_str(); }

 private:
  std::string _msg;
};
|
|
313
|
+
|
|
314
|
// Ordering predicate for C strings: compares the characters pointed to (via
// strcmp), not the pointer values themselves.
struct attr_cmp {
  bool operator()(const char* const& a, const char* const& b) const {
    const int order = std::strcmp(a, b);
    return order < 0;
  }
};
|
|
319
|
+
|
|
320
|
// Snapshot of the parser's position, as handed out by file::state() and
// accepted by file::set_state().
struct parser_state {
  parser_state() : s(NONE), hanging(0), off(0) {}
  // Names of the currently open tags; file::reset() seeds it with "[root]".
  std::stack<std::string> tagStack;
  // Current state of the state machine.
  state s;
  // Subtracted from tagStack.size() by file::level(); incremented when an
  // opening tag is pushed and decremented at the start of file::next().
  size_t hanging;
  // Byte offset in the file; file::set_state() seeks to it.
  int64_t off;
};
|
|
327
|
+
|
|
328
|
// Attribute name -> attribute value, both held as C-string pointers and
// ordered by the string contents (attr_cmp).
typedef std::map<const char*, const char*, attr_cmp> AttrMap;
|
|
329
|
+
|
|
330
|
// The item most recently produced by file::next(). The pointers are not owned
// by this struct; file::next() points them into the parser's read buffers or
// at constant strings.
// NOTE(review): presumably only valid until the following next() call -
// confirm before keeping these pointers around.
struct tag {
  // Element name; file::next() sets it to an empty string for text content.
  const char* name;
  // Text content; an empty string when the item is an element.
  const char* text;
  // Attributes of the element.
  AttrMap attrs;
};
|
|
335
|
+
|
|
336
|
+class file {
|
|
337
|
+ public:
|
|
338
|
+ file(const std::string& path);
|
|
339
|
+ ~file();
|
|
340
|
+
|
|
341
|
+ const tag& get() const;
|
|
342
|
+
|
|
343
|
+ bool next();
|
|
344
|
+ size_t level() const;
|
|
345
|
+ void reset();
|
|
346
|
+ parser_state state();
|
|
347
|
+ void set_state(const parser_state& s);
|
|
348
|
+ static std::string decode(const char* str);
|
|
349
|
+ static std::string decode(const std::string& str);
|
|
350
|
+
|
|
351
|
+ private:
|
|
352
|
+ int _file;
|
|
353
|
+ parser_state _s;
|
|
354
|
+ parser_state _prevs;
|
|
355
|
+ char** _buffer;
|
|
356
|
+ char* _c;
|
|
357
|
+ int64_t _lastBytes;
|
|
358
|
+
|
|
359
|
+ const char* _tmp;
|
|
360
|
+ const char* _tmp2;
|
|
361
|
+
|
|
362
|
+ size_t _which;
|
|
363
|
+ std::string _path;
|
|
364
|
+
|
|
365
|
+ int64_t _totReadBef;
|
|
366
|
+ int64_t _lastNewData;
|
|
367
|
+
|
|
368
|
+ tag _ret;
|
|
369
|
+
|
|
370
|
+ static size_t utf8(size_t cp, char* out);
|
|
371
|
+ const char* emptyStr = "";
|
|
372
|
+};
|
|
373
|
+
|
|
374
|
+// _____________________________________________________________________________
|
|
375
|
+inline file::file(const std::string& path)
|
|
376
|
+ : _file(0), _c(0), _lastBytes(0), _which(0), _path(path), _totReadBef(0) {
|
|
377
|
+ _buffer = new char*[2];
|
|
378
|
+ _buffer[0] = new char[BUFFER_S + 1];
|
|
379
|
+ _buffer[1] = new char[BUFFER_S + 1];
|
|
380
|
+
|
|
381
|
+ reset();
|
|
382
|
+}
|
|
383
|
+
|
|
384
|
+// _____________________________________________________________________________
|
|
385
|
+inline file::~file() {
|
|
386
|
+ delete[] _buffer[0];
|
|
387
|
+ delete[] _buffer[1];
|
|
388
|
+ delete[] _buffer;
|
|
389
|
+ close(_file);
|
|
390
|
+}
|
|
391
|
+
|
|
392
|
+// _____________________________________________________________________________
|
|
393
|
+inline void file::reset() {
|
|
394
|
+ _which = 0;
|
|
395
|
+ _s.s = NONE;
|
|
396
|
+ _s.hanging = 0;
|
|
397
|
+ _totReadBef = 0;
|
|
398
|
+
|
|
399
|
+ if (_file) close(_file);
|
|
400
|
+ _file = open(_path.c_str(), O_RDONLY);
|
|
401
|
+ if (_file < 0)
|
|
402
|
+ throw parse_exc(std::string("could not open file"), _path, 0, 0, 0);
|
|
403
|
+#ifdef __unix__
|
|
404
|
+ posix_fadvise(_file, 0, 0, POSIX_FADV_SEQUENTIAL);
|
|
405
|
+#endif
|
|
406
|
+
|
|
407
|
+ _lastBytes = read(_file, _buffer[_which], BUFFER_S);
|
|
408
|
+ _lastNewData = _lastBytes;
|
|
409
|
+ _c = _buffer[_which];
|
|
410
|
+ while (!_s.tagStack.empty()) _s.tagStack.pop();
|
|
411
|
+ _s.tagStack.push("[root]");
|
|
412
|
+ _prevs = _s;
|
|
413
|
+}
|
|
414
|
+
|
|
415
|
+// _____________________________________________________________________________
|
|
416
|
+inline size_t file::level() const { return _s.tagStack.size() - _s.hanging; }
|
|
417
|
+
|
|
418
|
+// _____________________________________________________________________________
|
|
419
|
+inline parser_state file::state() { return _prevs; }
|
|
420
|
+
|
|
421
|
+// _____________________________________________________________________________
|
|
422
|
+inline void file::set_state(const parser_state& s) {
|
|
423
|
+ _s = s;
|
|
424
|
+ _prevs = s;
|
|
425
|
+
|
|
426
|
+ lseek(_file, _s.off, SEEK_SET);
|
|
427
|
+ _totReadBef = _s.off;
|
|
428
|
+ _lastBytes = read(_file, _buffer[_which], BUFFER_S);
|
|
429
|
+ _lastNewData = _lastBytes;
|
|
430
|
+ _c = _buffer[_which];
|
|
431
|
+
|
|
432
|
+ next();
|
|
433
|
+}
|
|
434
|
+
|
|
435
|
+// _____________________________________________________________________________
|
|
436
|
+inline const tag& file::get() const { return _ret; }
|
|
437
|
+
|
|
438
|
+// _____________________________________________________________________________
|
|
439
|
+inline bool file::next() {
|
|
440
|
+ if (!_s.tagStack.size()) return false;
|
|
441
|
+ // avoid too much stack copying
|
|
442
|
+ if (_prevs.tagStack.size() != _s.tagStack.size() ||
|
|
443
|
+ _prevs.tagStack.top() != _s.tagStack.top()) {
|
|
444
|
+ _prevs.tagStack = _s.tagStack;
|
|
445
|
+ }
|
|
446
|
+ _prevs.s = _s.s;
|
|
447
|
+ _prevs.hanging = _s.hanging;
|
|
448
|
+ _prevs.off =
|
|
449
|
+ _totReadBef + (_c - _buffer[_which]) - (_lastBytes - _lastNewData);
|
|
450
|
+
|
|
451
|
+ if (_s.hanging) _s.hanging--;
|
|
452
|
+ _ret.name = 0;
|
|
453
|
+ _ret.text = emptyStr;
|
|
454
|
+ _ret.attrs.clear();
|
|
455
|
+ void* i;
|
|
456
|
+ while (_lastBytes) {
|
|
457
|
+ for (; _c - _buffer[_which] < _lastBytes; ++_c) {
|
|
458
|
+ char c = *_c;
|
|
459
|
+ switch (_s.s) {
|
|
460
|
+ case NONE:
|
|
461
|
+ if (std::isspace(c))
|
|
462
|
+ continue;
|
|
463
|
+ else if (c == '<') {
|
|
464
|
+ _s.s = IN_TAG_TENTATIVE;
|
|
465
|
+ continue;
|
|
466
|
+ }
|
|
467
|
+ _s.s = IN_TEXT;
|
|
468
|
+ _ret.name = emptyStr;
|
|
469
|
+ _tmp = _c;
|
|
470
|
+ continue;
|
|
471
|
+
|
|
472
|
+ case IN_TEXT:
|
|
473
|
+ i = memchr(_c, '<', _lastBytes - (_c - _buffer[_which]));
|
|
474
|
+ if (!i) {
|
|
475
|
+ _c = _buffer[_which] + _lastBytes;
|
|
476
|
+ continue;
|
|
477
|
+ }
|
|
478
|
+ _c = (char*)i;
|
|
479
|
+ *_c = 0;
|
|
480
|
+ _ret.text = _tmp;
|
|
481
|
+ _s.s = IN_TAG_TENTATIVE;
|
|
482
|
+ _c++;
|
|
483
|
+ return true;
|
|
484
|
+
|
|
485
|
+ case IN_COMMENT_TENTATIVE:
|
|
486
|
+ if (c == '-') {
|
|
487
|
+ _s.s = IN_COMMENT_TENTATIVE2;
|
|
488
|
+ continue;
|
|
489
|
+ }
|
|
490
|
+ throw parse_exc("Expected comment", _path, _c, _buffer[_which],
|
|
491
|
+ _prevs.off);
|
|
492
|
+
|
|
493
|
+ case IN_COMMENT_TENTATIVE2:
|
|
494
|
+ if (c == '-') {
|
|
495
|
+ _s.s = IN_COMMENT;
|
|
496
|
+ continue;
|
|
497
|
+ }
|
|
498
|
+ throw parse_exc("Expected comment", _path, _c, _buffer[_which],
|
|
499
|
+ _prevs.off);
|
|
500
|
+
|
|
501
|
+ case IN_COMMENT_CL_TENTATIVE:
|
|
502
|
+ if (c == '-') {
|
|
503
|
+ _s.s = IN_COMMENT_CL_TENTATIVE2;
|
|
504
|
+ continue;
|
|
505
|
+ }
|
|
506
|
+ _s.s = IN_COMMENT;
|
|
507
|
+ continue;
|
|
508
|
+
|
|
509
|
+ case IN_COMMENT_CL_TENTATIVE2:
|
|
510
|
+ if (c == '>') {
|
|
511
|
+ _s.s = NONE;
|
|
512
|
+ continue;
|
|
513
|
+ }
|
|
514
|
+ _s.s = IN_COMMENT;
|
|
515
|
+ // fall through, we are still in comment
|
|
516
|
+
|
|
517
|
+ case IN_COMMENT:
|
|
518
|
+ i = memchr(_c, '-', _lastBytes - (_c - _buffer[_which]));
|
|
519
|
+ if (!i) {
|
|
520
|
+ _c = _buffer[_which] + _lastBytes;
|
|
521
|
+ continue;
|
|
522
|
+ }
|
|
523
|
+ _c = (char*)i;
|
|
524
|
+ _s.s = IN_COMMENT_CL_TENTATIVE;
|
|
525
|
+ continue;
|
|
526
|
+
|
|
527
|
+ case IN_TAG_TENTATIVE:
|
|
528
|
+ if (c == '/') {
|
|
529
|
+ _s.s = IN_TAG_NAME_CLOSE;
|
|
530
|
+ _tmp = _c + 1;
|
|
531
|
+ continue;
|
|
532
|
+ } else if (c == '?') {
|
|
533
|
+ _s.s = IN_TAG_NAME_META;
|
|
534
|
+ continue;
|
|
535
|
+ } else if (c == '!') {
|
|
536
|
+ _s.s = IN_COMMENT_TENTATIVE;
|
|
537
|
+ continue;
|
|
538
|
+ } else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
|
|
539
|
+ _s.s = IN_TAG_NAME;
|
|
540
|
+ _ret.name = _c;
|
|
541
|
+ continue;
|
|
542
|
+ }
|
|
543
|
+
|
|
544
|
+ case IN_TAG:
|
|
545
|
+ if (std::isspace(c))
|
|
546
|
+ continue;
|
|
547
|
+ else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
|
|
548
|
+ _s.s = IN_ATTRKEY;
|
|
549
|
+ _tmp = _c;
|
|
550
|
+ continue;
|
|
551
|
+ } else if (c == '/') {
|
|
552
|
+ _s.s = AW_CLOSING;
|
|
553
|
+ continue;
|
|
554
|
+ } else if (c == '>') {
|
|
555
|
+ _s.hanging++;
|
|
556
|
+ _s.tagStack.push(_ret.name);
|
|
557
|
+ _s.s = WS_SKIP;
|
|
558
|
+ continue;
|
|
559
|
+ }
|
|
560
|
+ throw parse_exc("Expected valid tag", _path, _c, _buffer[_which],
|
|
561
|
+ _prevs.off);
|
|
562
|
+
|
|
563
|
+ case IN_ATTRVAL_SQ:
|
|
564
|
+ i = memchr(_c, '\'', _lastBytes - (_c - _buffer[_which]));
|
|
565
|
+ if (!i) {
|
|
566
|
+ _c = _buffer[_which] + _lastBytes;
|
|
567
|
+ continue;
|
|
568
|
+ }
|
|
569
|
+ _c = (char*)i;
|
|
570
|
+ _s.s = IN_TAG;
|
|
571
|
+ *_c = 0;
|
|
572
|
+ _ret.attrs[_tmp] = _tmp2;
|
|
573
|
+ continue;
|
|
574
|
+
|
|
575
|
+ case IN_ATTRVAL_DQ:
|
|
576
|
+ i = memchr(_c, '"', _lastBytes - (_c - _buffer[_which]));
|
|
577
|
+ if (!i) {
|
|
578
|
+ _c = _buffer[_which] + _lastBytes;
|
|
579
|
+ continue;
|
|
580
|
+ }
|
|
581
|
+ _c = (char*)i;
|
|
582
|
+ _s.s = IN_TAG;
|
|
583
|
+ *_c = 0;
|
|
584
|
+ _ret.attrs[_tmp] = _tmp2;
|
|
585
|
+ continue;
|
|
586
|
+
|
|
587
|
+ case AW_IN_ATTRVAL:
|
|
588
|
+ if (std::isspace(c))
|
|
589
|
+ continue;
|
|
590
|
+ else if (c == '\'') {
|
|
591
|
+ _s.s = IN_ATTRVAL_SQ;
|
|
592
|
+ _tmp2 = _c + 1;
|
|
593
|
+ continue;
|
|
594
|
+ } else if (c == '"') {
|
|
595
|
+ _s.s = IN_ATTRVAL_DQ;
|
|
596
|
+ _tmp2 = _c + 1;
|
|
597
|
+ continue;
|
|
598
|
+ }
|
|
599
|
+ throw parse_exc("Expected attribute value", _path, _c,
|
|
600
|
+ _buffer[_which], _prevs.off);
|
|
601
|
+
|
|
602
|
+ case IN_ATTRKEY:
|
|
603
|
+ if (std::isspace(c)) {
|
|
604
|
+ *_c = 0;
|
|
605
|
+ _s.s = AFTER_ATTRKEY;
|
|
606
|
+ continue;
|
|
607
|
+ } else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
|
|
608
|
+ continue;
|
|
609
|
+ } else if (c == '=') {
|
|
610
|
+ *_c = 0;
|
|
611
|
+ _s.s = AW_IN_ATTRVAL;
|
|
612
|
+ continue;
|
|
613
|
+ }
|
|
614
|
+
|
|
615
|
+ throw parse_exc("Expected attribute key char or =", _path, _c,
|
|
616
|
+ _buffer[_which], _prevs.off);
|
|
617
|
+
|
|
618
|
+ case AFTER_ATTRKEY:
|
|
619
|
+ if (std::isspace(c))
|
|
620
|
+ continue;
|
|
621
|
+ else if (c == '=') {
|
|
622
|
+ _s.s = AW_IN_ATTRVAL;
|
|
623
|
+ continue;
|
|
624
|
+ }
|
|
625
|
+ throw parse_exc(
|
|
626
|
+ std::string("Expected attribute value for '") + _tmp + "'.",
|
|
627
|
+ _path, _c, _buffer[_which], _prevs.off);
|
|
628
|
+
|
|
629
|
+ case IN_TAG_NAME:
|
|
630
|
+ if (std::isspace(c)) {
|
|
631
|
+ *_c = 0;
|
|
632
|
+ _s.s = IN_TAG;
|
|
633
|
+ continue;
|
|
634
|
+ } else if (c == '>') {
|
|
635
|
+ *_c = 0;
|
|
636
|
+ _s.hanging++;
|
|
637
|
+ _s.tagStack.push(_ret.name);
|
|
638
|
+ _s.s = WS_SKIP;
|
|
639
|
+ continue;
|
|
640
|
+ } else if (c == '/') {
|
|
641
|
+ *_c = 0;
|
|
642
|
+ _s.s = AW_CLOSING;
|
|
643
|
+ continue;
|
|
644
|
+ } else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
|
|
645
|
+ continue;
|
|
646
|
+ }
|
|
647
|
+
|
|
648
|
+ case IN_TAG_NAME_META:
|
|
649
|
+ // TODO: read meta tags!
|
|
650
|
+ if (c == '>') {
|
|
651
|
+ _s.s = NONE;
|
|
652
|
+ continue;
|
|
653
|
+ }
|
|
654
|
+
|
|
655
|
+ continue;
|
|
656
|
+
|
|
657
|
+ case IN_TAG_NAME_CLOSE:
|
|
658
|
+ if (std::isspace(c)) {
|
|
659
|
+ *_c = 0;
|
|
660
|
+ _s.s = IN_TAG_CLOSE;
|
|
661
|
+ continue;
|
|
662
|
+ } else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
|
|
663
|
+ continue;
|
|
664
|
+ } else if (c == '>') {
|
|
665
|
+ *_c = 0;
|
|
666
|
+ if (_tmp != _s.tagStack.top()) {
|
|
667
|
+ throw parse_exc(std::string("Closing wrong tag '<") + _tmp +
|
|
668
|
+ ">', expected close of '<" +
|
|
669
|
+ _s.tagStack.top() + ">'.",
|
|
670
|
+ _path, _c, _buffer[_which], _prevs.off);
|
|
671
|
+ }
|
|
672
|
+ _s.tagStack.pop();
|
|
673
|
+ _s.s = NONE;
|
|
674
|
+ continue;
|
|
675
|
+ }
|
|
676
|
+
|
|
677
|
+ case IN_TAG_CLOSE:
|
|
678
|
+ if (std::isspace(c))
|
|
679
|
+ continue;
|
|
680
|
+ else if (c == '>') {
|
|
681
|
+ if (_tmp != _s.tagStack.top()) {
|
|
682
|
+ throw parse_exc(std::string("Closing wrong tag '<") + _tmp +
|
|
683
|
+ ">', expected close of '<" +
|
|
684
|
+ _s.tagStack.top() + ">'.",
|
|
685
|
+ _path, _c, _buffer[_which], _prevs.off);
|
|
686
|
+ }
|
|
687
|
+ _s.tagStack.pop();
|
|
688
|
+ _s.s = NONE;
|
|
689
|
+ continue;
|
|
690
|
+ }
|
|
691
|
+ throw parse_exc("Expected '>'", _path, _c, _buffer[_which],
|
|
692
|
+ _prevs.off);
|
|
693
|
+
|
|
694
|
+ case AW_CLOSING:
|
|
695
|
+ if (c == '>') {
|
|
696
|
+ _s.s = WS_SKIP;
|
|
697
|
+ continue;
|
|
698
|
+ }
|
|
699
|
+
|
|
700
|
+ case WS_SKIP:
|
|
701
|
+ if (std::isspace(c)) continue;
|
|
702
|
+ _s.s = NONE;
|
|
703
|
+ return true;
|
|
704
|
+ }
|
|
705
|
+ }
|
|
706
|
+
|
|
707
|
+ // buffer ended, read new stuff, but copy remaining if needed
|
|
708
|
+ size_t off = 0;
|
|
709
|
+ if (_s.s == IN_TAG_NAME) { //|| IN_TAG_NAME_META) {
|
|
710
|
+ off = _lastBytes - (_ret.name - _buffer[_which]);
|
|
711
|
+ memmove(_buffer[!_which], _ret.name, off);
|
|
712
|
+ _ret.name = _buffer[!_which];
|
|
713
|
+ } else if (_s.s == IN_TAG_NAME_CLOSE || _s.s == IN_ATTRKEY ||
|
|
714
|
+ _s.s == IN_TEXT) {
|
|
715
|
+ off = _lastBytes - (_tmp - _buffer[_which]);
|
|
716
|
+ memmove(_buffer[!_which], _tmp, off);
|
|
717
|
+ _tmp = _buffer[!_which];
|
|
718
|
+ } else if (_s.s == IN_ATTRVAL_SQ || _s.s == IN_ATTRVAL_DQ) {
|
|
719
|
+ off = _lastBytes - (_tmp2 - _buffer[_which]);
|
|
720
|
+ memmove(_buffer[!_which], _tmp2, off);
|
|
721
|
+ _tmp2 = _buffer[!_which];
|
|
722
|
+ }
|
|
723
|
+
|
|
724
|
+ assert(off <= BUFFER_S);
|
|
725
|
+
|
|
726
|
+ size_t readb = read(_file, _buffer[!_which] + off, BUFFER_S - off);
|
|
727
|
+ if (!readb) break;
|
|
728
|
+ _totReadBef += _lastNewData;
|
|
729
|
+ _which = !_which;
|
|
730
|
+ _lastNewData = readb;
|
|
731
|
+ _lastBytes = _lastNewData + off;
|
|
732
|
+ _c = _buffer[_which] + off;
|
|
733
|
+ }
|
|
734
|
+
|
|
735
|
+ if (_s.tagStack.size()) {
|
|
736
|
+ if (_s.tagStack.top() != "[root]") {
|
|
737
|
+ throw parse_exc("XML tree not complete", _path, _c, _buffer[_which],
|
|
738
|
+ _prevs.off);
|
|
739
|
+ }
|
|
740
|
+ _s.tagStack.pop();
|
|
741
|
+ }
|
|
742
|
+ _s.s = NONE;
|
|
743
|
+ _ret.name = "[root]";
|
|
744
|
+ return false;
|
|
745
|
+}
|
|
746
|
+
|
|
747
|
+// _____________________________________________________________________________
|
|
748
|
+inline std::string file::decode(const std::string& str) {
|
|
749
|
+ return decode(str.c_str());
|
|
750
|
+}
|
|
751
|
+
|
|
752
|
+// _____________________________________________________________________________
|
|
753
|
+inline std::string file::decode(const char* str) {
|
|
754
|
+ const char* c = strchr(str, '&');
|
|
755
|
+ if (!c) return str;
|
|
756
|
+
|
|
757
|
+ char* decRet = new char[strlen(str) + 1];
|
|
758
|
+ const char* last = str;
|
|
759
|
+ char* dstPt = decRet;
|
|
760
|
+
|
|
761
|
+ for (; c != 0; c = strchr(c + 1, '&')) {
|
|
762
|
+ memcpy(dstPt, last, c - last);
|
|
763
|
+ dstPt += c - last;
|
|
764
|
+ last = c;
|
|
765
|
+
|
|
766
|
+ if (*(c + 1) == '#') {
|
|
767
|
+ uint64_t cp = -1;
|
|
768
|
+ char* tail;
|
|
769
|
+ errno = 0;
|
|
770
|
+ if (*(c + 2) == 'x' || *(c + 2) == 'X')
|
|
771
|
+ cp = strtoul(c + 3, &tail, 16);
|
|
772
|
+ else
|
|
773
|
+ cp = strtoul(c + 2, &tail, 10);
|
|
774
|
+
|
|
775
|
+ if (*tail == ';' && cp <= 0x1FFFFF && !errno) {
|
|
776
|
+ dstPt += utf8(cp, dstPt);
|
|
777
|
+ last = tail + 1;
|
|
778
|
+ }
|
|
779
|
+ } else {
|
|
780
|
+ const char* e = strchr(c, ';');
|
|
781
|
+ if (e) {
|
|
782
|
+ char* ent = new char[e - 1 - c + 1];
|
|
783
|
+ memcpy(ent, c + 1, e - 1 - c);
|
|
784
|
+ ent[e - 1 - c] = 0;
|
|
785
|
+ const auto it = ENTITIES.find(ent);
|
|
786
|
+ if (it != ENTITIES.end()) {
|
|
787
|
+ const char* utf8 = it->second;
|
|
788
|
+ memcpy(dstPt, utf8, strlen(utf8));
|
|
789
|
+ dstPt += strlen(utf8);
|
|
790
|
+ last += strlen(ent) + 2;
|
|
791
|
+ }
|
|
792
|
+ delete[] ent;
|
|
793
|
+ }
|
|
794
|
+ }
|
|
795
|
+ }
|
|
796
|
+
|
|
797
|
+ strcpy(dstPt, last);
|
|
798
|
+ std::string ret(decRet);
|
|
799
|
+ delete[] decRet;
|
|
800
|
+ return ret;
|
|
801
|
+}
|
|
802
|
+
|
|
803
|
+// _____________________________________________________________________________
|
|
804
|
+inline size_t file::utf8(size_t cp, char* out) {
|
|
805
|
+ if (cp <= 0x7F) {
|
|
806
|
+ out[0] = cp & 0x7F;
|
|
807
|
+ return 1;
|
|
808
|
+ } else if (cp <= 0x7FF) {
|
|
809
|
+ out[0] = 0xC0 | (cp >> 6);
|
|
810
|
+ out[1] = 0x80 | (cp & 0x3F);
|
|
811
|
+ return 2;
|
|
812
|
+ } else if (cp <= 0xFFFF) {
|
|
813
|
+ out[0] = 0xE0 | (cp >> 12);
|
|
814
|
+ out[1] = 0x80 | ((cp >> 6) & 0x3F);
|
|
815
|
+ out[2] = 0x80 | (cp & 0x3F);
|
|
816
|
+ return 3;
|
|
817
|
+ } else if (cp <= 0x1FFFFF) {
|
|
818
|
+ out[0] = 0xF0 | (cp >> 18);
|
|
819
|
+ out[1] = 0x80 | ((cp >> 12) & 0x3F);
|
|
820
|
+ out[2] = 0x80 | ((cp >> 6) & 0x3F);
|
|
821
|
+ out[3] = 0x80 | (cp & 0x3F);
|
|
822
|
+ return 4;
|
|
823
|
+ }
|
|
824
|
+
|
|
825
|
+ return 0;
|
|
826
|
+}
|
|
827
|
+}
|
|
828
|
+
|
|
829
|
+#endif // PFXML_H_
|