12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830 |
- // Copyright 2017 Patrick Brosi
- // info@patrickbrosi.de
- #ifndef PFXML_H_
- #define PFXML_H_
#include <fcntl.h>
#include <unistd.h>

#include <cassert>
#include <cctype>
#include <cerrno>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <exception>
#include <fstream>
#include <map>
#include <sstream>
#include <stack>
#include <string>
- namespace pfxml {
// Number of payload bytes in each of the parser's read buffers (16 KiB).
static constexpr size_t BUFFER_S = 16u << 10;
// States of the character-level state machine driven by file::next().
enum state {
  NONE = 0,                       // between constructs: skip whitespace, '<' opens a tag, anything else starts text
  IN_TAG_NAME = 1,                // reading the name of an opening tag
  IN_TAG_NAME_META = 2,           // inside "<?...": everything up to '>' is skipped
  IN_TAG = 3,                     // inside an opening tag, after its name / between attributes
  IN_TAG_CLOSE = 4,               // after the name of a closing tag, waiting for '>'
  IN_TAG_NAME_CLOSE = 5,          // reading the name of a closing tag ("</name")
  IN_TAG_TENTATIVE = 6,           // directly after '<', kind of tag not yet known
  IN_ATTRKEY = 7,                 // reading an attribute key
  AFTER_ATTRKEY = 8,              // whitespace after an attribute key, waiting for '='
  AW_IN_ATTRVAL = 9,              // after '=', waiting for the opening quote
  IN_ATTRVAL_SQ = 10,             // inside a single-quoted attribute value
  IN_ATTRVAL_DQ = 11,             // inside a double-quoted attribute value
  IN_TEXT = 12,                   // inside character data, up to the next '<'
  IN_COMMENT_TENTATIVE = 13,      // after "<!", expecting the first '-'
  IN_COMMENT_TENTATIVE2 = 14,     // after "<!-", expecting the second '-'
  IN_COMMENT = 15,                // inside a comment body
  IN_COMMENT_CL_TENTATIVE = 16,   // saw one '-' inside a comment
  IN_COMMENT_CL_TENTATIVE2 = 17,  // saw "--" inside a comment, '>' would close it
  AW_CLOSING = 18,                // saw '/' inside an opening tag, waiting for '>'
  WS_SKIP = 19                    // tag finished: skip whitespace, then hand the tag out
};
// Named character references known to file::decode(): the entity name (the
// text between '&' and ';') mapped to its replacement text. Entries are kept
// in their original order, grouped by initial letter.
// see
// http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
static const std::map<std::string, const char*> ENTITIES = {
    // a
    {"aacute", "á"}, {"Aacute", "Á"}, {"acirc", "â"}, {"Acirc", "Â"},
    {"acute", "´"}, {"aelig", "æ"}, {"AElig", "Æ"}, {"agrave", "à"},
    {"Agrave", "À"}, {"alefsym", "ℵ"}, {"alpha", "α"}, {"Alpha", "Α"},
    {"amp", "&"}, {"and", "∧"}, {"ang", "∠"}, {"apos", "'"},
    {"aring", "å"}, {"Aring", "Å"}, {"asymp", "≈"}, {"atilde", "ã"},
    {"Atilde", "Ã"}, {"auml", "ä"}, {"Auml", "Ä"},
    // b
    {"bdquo", "„"}, {"beta", "β"}, {"Beta", "Β"}, {"brvbar", "¦"},
    {"bull", "•"},
    // c
    {"cap", "∩"}, {"ccedil", "ç"}, {"Ccedil", "Ç"}, {"cedil", "¸"},
    {"cent", "¢"}, {"chi", "χ"}, {"Chi", "Χ"}, {"circ", "ˆ"},
    {"clubs", "♣"}, {"cong", "≅"}, {"copy", "©"}, {"crarr", "↵"},
    {"cup", "∪"}, {"curren", "¤"},
    // d
    {"dagger", "†"}, {"Dagger", "‡"}, {"darr", "↓"}, {"dArr", "⇓"},
    {"deg", "°"}, {"delta", "δ"}, {"Delta", "Δ"}, {"diams", "♦"},
    {"divide", "÷"},
    // e
    {"eacute", "é"}, {"Eacute", "É"}, {"ecirc", "ê"}, {"Ecirc", "Ê"},
    {"egrave", "è"}, {"Egrave", "È"}, {"empty", "∅"}, {"emsp", "\xE2\x80\x83"},
    {"ensp", "\xE2\x80\x82"}, {"epsilon", "ε"}, {"Epsilon", "Ε"}, {"equiv", "≡"},
    {"eta", "η"}, {"Eta", "Η"}, {"eth", "ð"}, {"ETH", "Ð"},
    {"euml", "ë"}, {"Euml", "Ë"}, {"euro", "€"}, {"exist", "∃"},
    // f
    {"fnof", "ƒ"}, {"forall", "∀"}, {"frac12", "½"}, {"frac14", "¼"},
    {"frac34", "¾"}, {"frasl", "⁄"},
    // g
    {"gamma", "γ"}, {"Gamma", "Γ"}, {"ge", "≥"}, {"gt", ">"},
    // h
    {"harr", "↔"}, {"hArr", "⇔"}, {"hearts", "♥"}, {"hellip", "…"},
    // i
    {"iacute", "í"}, {"Iacute", "Í"}, {"icirc", "î"}, {"Icirc", "Î"},
    {"iexcl", "¡"}, {"igrave", "ì"}, {"Igrave", "Ì"}, {"image", "ℑ"},
    {"infin", "∞"}, {"int", "∫"}, {"iota", "ι"}, {"Iota", "Ι"},
    {"iquest", "¿"}, {"isin", "∈"}, {"iuml", "ï"}, {"Iuml", "Ï"},
    // k
    {"kappa", "κ"}, {"Kappa", "Κ"},
    // l
    {"lambda", "λ"}, {"Lambda", "Λ"}, {"lang", "〈"}, {"laquo", "«"},
    {"larr", "←"}, {"lArr", "⇐"}, {"lceil", "⌈"}, {"ldquo", "“"},
    {"le", "≤"}, {"lfloor", "⌊"}, {"lowast", "∗"}, {"loz", "◊"},
    {"lrm", "\xE2\x80\x8E"}, {"lsaquo", "‹"}, {"lsquo", "‘"}, {"lt", "<"},
    // m
    {"macr", "¯"}, {"mdash", "—"}, {"micro", "µ"}, {"middot", "·"},
    {"minus", "−"}, {"mu", "μ"}, {"Mu", "Μ"},
    // n
    {"nabla", "∇"}, {"nbsp", "\xC2\xA0"}, {"ndash", "–"}, {"ne", "≠"},
    {"ni", "∋"}, {"not", "¬"}, {"notin", "∉"}, {"nsub", "⊄"},
    {"ntilde", "ñ"}, {"Ntilde", "Ñ"}, {"nu", "ν"}, {"Nu", "Ν"},
    // o
    {"oacute", "ó"}, {"Oacute", "Ó"}, {"ocirc", "ô"}, {"Ocirc", "Ô"},
    {"oelig", "œ"}, {"OElig", "Œ"}, {"ograve", "ò"}, {"Ograve", "Ò"},
    {"oline", "‾"}, {"omega", "ω"}, {"Omega", "Ω"}, {"omicron", "ο"},
    {"Omicron", "Ο"}, {"oplus", "⊕"}, {"or", "∨"}, {"ordf", "ª"},
    {"ordm", "º"}, {"oslash", "ø"}, {"Oslash", "Ø"}, {"otilde", "õ"},
    {"Otilde", "Õ"}, {"otimes", "⊗"}, {"ouml", "ö"}, {"Ouml", "Ö"},
    // p
    {"para", "¶"}, {"part", "∂"}, {"permil", "‰"}, {"perp", "⊥"},
    {"phi", "φ"}, {"Phi", "Φ"}, {"piv", "ϖ"}, {"pi", "π"},
    {"Pi", "Π"}, {"plusmn", "±"}, {"pound", "£"}, {"prime", "′"},
    {"Prime", "″"}, {"prod", "∏"}, {"prop", "∝"}, {"psi", "ψ"},
    {"Psi", "Ψ"},
    // q
    {"quot", "\""},
    // r
    {"radic", "√"}, {"rang", "〉"}, {"raquo", "»"}, {"rarr", "→"},
    {"rArr", "⇒"}, {"rceil", "⌉"}, {"rdquo", "”"}, {"real", "ℜ"},
    {"reg", "®"}, {"rfloor", "⌋"}, {"rho", "ρ"}, {"Rho", "Ρ"},
    {"rlm", "\xE2\x80\x8F"}, {"rsaquo", "›"}, {"rsquo", "’"},
    // s
    {"sbquo", "‚"}, {"scaron", "š"}, {"Scaron", "Š"}, {"sdot", "⋅"},
    {"sect", "§"}, {"shy", "\xC2\xAD"}, {"sigmaf", "ς"}, {"sigma", "σ"},
    {"Sigma", "Σ"}, {"sim", "∼"}, {"spades", "♠"}, {"sub", "⊂"},
    {"sube", "⊆"}, {"sum", "∑"}, {"sup", "⊃"}, {"sup1", "¹"},
    {"sup2", "²"}, {"sup3", "³"}, {"supe", "⊇"}, {"szlig", "ß"},
    // t
    {"tau", "τ"}, {"Tau", "Τ"}, {"there4", "∴"}, {"thetasym", "ϑ"},
    {"theta", "θ"}, {"Theta", "Θ"}, {"thinsp", "\xE2\x80\x89"}, {"thorn", "þ"},
    {"THORN", "Þ"}, {"tilde", "˜"}, {"times", "×"}, {"trade", "™"},
    // u
    {"uacute", "ú"}, {"Uacute", "Ú"}, {"uarr", "↑"}, {"uArr", "⇑"},
    {"ucirc", "û"}, {"Ucirc", "Û"}, {"ugrave", "ù"}, {"Ugrave", "Ù"},
    {"uml", "¨"}, {"upsih", "ϒ"}, {"upsilon", "υ"}, {"Upsilon", "Υ"},
    {"uuml", "ü"}, {"Uuml", "Ü"},
    // w
    {"weierp", "℘"},
    // x
    {"xi", "ξ"}, {"Xi", "Ξ"},
    // y
    {"yacute", "ý"}, {"Yacute", "Ý"}, {"yen", "¥"}, {"yuml", "ÿ"},
    {"Yuml", "Ÿ"},
    // z
    {"zeta", "ζ"}, {"Zeta", "Ζ"}, {"zwj", "\xE2\x80\x8D"},
    {"zwnj", "\xE2\x80\x8C"}};
// Exception raised by the parser. what() yields
// "<file> at position <offset + (p - buff)>: <msg>".
class parse_exc : public std::exception {
 public:
  // msg:    description of the problem
  // file:   name of the file being parsed
  // p:      position at which the problem was detected
  // buff:   start of the buffer p points into (p and buff may both be null)
  // offset: value added to (p - buff) to form the reported position
  parse_exc(std::string msg, std::string file, const char* p, char* buff,
            size_t offset) {
    std::stringstream ss;
    ss << file << " at position " << (offset + (p - buff)) << ": " << msg;
    _msg = ss.str();
  }

  // noexcept instead of the dynamic exception specification `throw()`, which
  // is deprecated since C++11 and no longer part of C++20.
  ~parse_exc() noexcept override {}

  // Returns the formatted message; the pointer stays valid as long as this
  // exception object is alive.
  const char* what() const noexcept override { return _msg.c_str(); }

 private:
  std::string _msg;
};
// Ordering predicate for C strings: compares the characters pointed to (via
// strcmp), not the pointer values, so it can serve as a map comparator for
// `const char*` keys.
struct attr_cmp {
  bool operator()(const char* const& a, const char* const& b) const {
    const int order = std::strcmp(a, b);
    return order < 0;
  }
};
- struct parser_state {
- parser_state() : s(NONE), hanging(0), off(0) {}
- std::stack<std::string> tagStack;
- state s;
- size_t hanging;
- int64_t off;
- };
- typedef std::map<const char*, const char*, attr_cmp> AttrMap;
- struct tag {
- const char* name;
- const char* text;
- AttrMap attrs;
- };
- class file {
- public:
- file(const std::string& path);
- ~file();
- const tag& get() const;
- bool next();
- size_t level() const;
- void reset();
- parser_state state();
- void set_state(const parser_state& s);
- static std::string decode(const char* str);
- static std::string decode(const std::string& str);
- private:
- int _file;
- parser_state _s;
- parser_state _prevs;
- char** _buffer;
- char* _c;
- int64_t _lastBytes;
- const char* _tmp;
- const char* _tmp2;
- size_t _which;
- std::string _path;
- int64_t _totReadBef;
- int64_t _lastNewData;
- tag _ret;
- static size_t utf8(size_t cp, char* out);
- const char* emptyStr = "";
- };
- // _____________________________________________________________________________
- inline file::file(const std::string& path)
- : _file(0), _c(0), _lastBytes(0), _which(0), _path(path), _totReadBef(0) {
- _buffer = new char*[2];
- _buffer[0] = new char[BUFFER_S + 1];
- _buffer[1] = new char[BUFFER_S + 1];
- reset();
- }
- // _____________________________________________________________________________
- inline file::~file() {
- delete[] _buffer[0];
- delete[] _buffer[1];
- delete[] _buffer;
- close(_file);
- }
- // _____________________________________________________________________________
- inline void file::reset() {
- _which = 0;
- _s.s = NONE;
- _s.hanging = 0;
- _totReadBef = 0;
- if (_file) close(_file);
- _file = open(_path.c_str(), O_RDONLY);
- if (_file < 0)
- throw parse_exc(std::string("could not open file"), _path, 0, 0, 0);
- #ifdef __unix__
- posix_fadvise(_file, 0, 0, POSIX_FADV_SEQUENTIAL);
- #endif
- _lastBytes = read(_file, _buffer[_which], BUFFER_S);
- _lastNewData = _lastBytes;
- _c = _buffer[_which];
- while (!_s.tagStack.empty()) _s.tagStack.pop();
- _s.tagStack.push("[root]");
- _prevs = _s;
- }
- // _____________________________________________________________________________
- inline size_t file::level() const { return _s.tagStack.size() - _s.hanging; }
- // _____________________________________________________________________________
- inline parser_state file::state() { return _prevs; }
- // _____________________________________________________________________________
- inline void file::set_state(const parser_state& s) {
- _s = s;
- _prevs = s;
- lseek(_file, _s.off, SEEK_SET);
- _totReadBef = _s.off;
- _lastBytes = read(_file, _buffer[_which], BUFFER_S);
- _lastNewData = _lastBytes;
- _c = _buffer[_which];
- next();
- }
- // _____________________________________________________________________________
- inline const tag& file::get() const { return _ret; }
- // _____________________________________________________________________________
- inline bool file::next() {
- if (!_s.tagStack.size()) return false;
- // avoid too much stack copying
- if (_prevs.tagStack.size() != _s.tagStack.size() ||
- _prevs.tagStack.top() != _s.tagStack.top()) {
- _prevs.tagStack = _s.tagStack;
- }
- _prevs.s = _s.s;
- _prevs.hanging = _s.hanging;
- _prevs.off =
- _totReadBef + (_c - _buffer[_which]) - (_lastBytes - _lastNewData);
- if (_s.hanging) _s.hanging--;
- _ret.name = 0;
- _ret.text = emptyStr;
- _ret.attrs.clear();
- void* i;
- while (_lastBytes) {
- for (; _c - _buffer[_which] < _lastBytes; ++_c) {
- char c = *_c;
- switch (_s.s) {
- case NONE:
- if (std::isspace(c))
- continue;
- else if (c == '<') {
- _s.s = IN_TAG_TENTATIVE;
- continue;
- }
- _s.s = IN_TEXT;
- _ret.name = emptyStr;
- _tmp = _c;
- continue;
- case IN_TEXT:
- i = memchr(_c, '<', _lastBytes - (_c - _buffer[_which]));
- if (!i) {
- _c = _buffer[_which] + _lastBytes;
- continue;
- }
- _c = (char*)i;
- *_c = 0;
- _ret.text = _tmp;
- _s.s = IN_TAG_TENTATIVE;
- _c++;
- return true;
- case IN_COMMENT_TENTATIVE:
- if (c == '-') {
- _s.s = IN_COMMENT_TENTATIVE2;
- continue;
- }
- throw parse_exc("Expected comment", _path, _c, _buffer[_which],
- _prevs.off);
- case IN_COMMENT_TENTATIVE2:
- if (c == '-') {
- _s.s = IN_COMMENT;
- continue;
- }
- throw parse_exc("Expected comment", _path, _c, _buffer[_which],
- _prevs.off);
- case IN_COMMENT_CL_TENTATIVE:
- if (c == '-') {
- _s.s = IN_COMMENT_CL_TENTATIVE2;
- continue;
- }
- _s.s = IN_COMMENT;
- continue;
- case IN_COMMENT_CL_TENTATIVE2:
- if (c == '>') {
- _s.s = NONE;
- continue;
- }
- _s.s = IN_COMMENT;
- // fall through, we are still in comment
- case IN_COMMENT:
- i = memchr(_c, '-', _lastBytes - (_c - _buffer[_which]));
- if (!i) {
- _c = _buffer[_which] + _lastBytes;
- continue;
- }
- _c = (char*)i;
- _s.s = IN_COMMENT_CL_TENTATIVE;
- continue;
- case IN_TAG_TENTATIVE:
- if (c == '/') {
- _s.s = IN_TAG_NAME_CLOSE;
- _tmp = _c + 1;
- continue;
- } else if (c == '?') {
- _s.s = IN_TAG_NAME_META;
- continue;
- } else if (c == '!') {
- _s.s = IN_COMMENT_TENTATIVE;
- continue;
- } else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
- _s.s = IN_TAG_NAME;
- _ret.name = _c;
- continue;
- }
- case IN_TAG:
- if (std::isspace(c))
- continue;
- else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
- _s.s = IN_ATTRKEY;
- _tmp = _c;
- continue;
- } else if (c == '/') {
- _s.s = AW_CLOSING;
- continue;
- } else if (c == '>') {
- _s.hanging++;
- _s.tagStack.push(_ret.name);
- _s.s = WS_SKIP;
- continue;
- }
- throw parse_exc("Expected valid tag", _path, _c, _buffer[_which],
- _prevs.off);
- case IN_ATTRVAL_SQ:
- i = memchr(_c, '\'', _lastBytes - (_c - _buffer[_which]));
- if (!i) {
- _c = _buffer[_which] + _lastBytes;
- continue;
- }
- _c = (char*)i;
- _s.s = IN_TAG;
- *_c = 0;
- _ret.attrs[_tmp] = _tmp2;
- continue;
- case IN_ATTRVAL_DQ:
- i = memchr(_c, '"', _lastBytes - (_c - _buffer[_which]));
- if (!i) {
- _c = _buffer[_which] + _lastBytes;
- continue;
- }
- _c = (char*)i;
- _s.s = IN_TAG;
- *_c = 0;
- _ret.attrs[_tmp] = _tmp2;
- continue;
- case AW_IN_ATTRVAL:
- if (std::isspace(c))
- continue;
- else if (c == '\'') {
- _s.s = IN_ATTRVAL_SQ;
- _tmp2 = _c + 1;
- continue;
- } else if (c == '"') {
- _s.s = IN_ATTRVAL_DQ;
- _tmp2 = _c + 1;
- continue;
- }
- throw parse_exc("Expected attribute value", _path, _c,
- _buffer[_which], _prevs.off);
- case IN_ATTRKEY:
- if (std::isspace(c)) {
- *_c = 0;
- _s.s = AFTER_ATTRKEY;
- continue;
- } else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
- continue;
- } else if (c == '=') {
- *_c = 0;
- _s.s = AW_IN_ATTRVAL;
- continue;
- }
- throw parse_exc("Expected attribute key char or =", _path, _c,
- _buffer[_which], _prevs.off);
- case AFTER_ATTRKEY:
- if (std::isspace(c))
- continue;
- else if (c == '=') {
- _s.s = AW_IN_ATTRVAL;
- continue;
- }
- throw parse_exc(
- std::string("Expected attribute value for '") + _tmp + "'.",
- _path, _c, _buffer[_which], _prevs.off);
- case IN_TAG_NAME:
- if (std::isspace(c)) {
- *_c = 0;
- _s.s = IN_TAG;
- continue;
- } else if (c == '>') {
- *_c = 0;
- _s.hanging++;
- _s.tagStack.push(_ret.name);
- _s.s = WS_SKIP;
- continue;
- } else if (c == '/') {
- *_c = 0;
- _s.s = AW_CLOSING;
- continue;
- } else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
- continue;
- }
- case IN_TAG_NAME_META:
- // TODO: read meta tags!
- if (c == '>') {
- _s.s = NONE;
- continue;
- }
- continue;
- case IN_TAG_NAME_CLOSE:
- if (std::isspace(c)) {
- *_c = 0;
- _s.s = IN_TAG_CLOSE;
- continue;
- } else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
- continue;
- } else if (c == '>') {
- *_c = 0;
- if (_tmp != _s.tagStack.top()) {
- throw parse_exc(std::string("Closing wrong tag '<") + _tmp +
- ">', expected close of '<" +
- _s.tagStack.top() + ">'.",
- _path, _c, _buffer[_which], _prevs.off);
- }
- _s.tagStack.pop();
- _s.s = NONE;
- continue;
- }
- case IN_TAG_CLOSE:
- if (std::isspace(c))
- continue;
- else if (c == '>') {
- if (_tmp != _s.tagStack.top()) {
- throw parse_exc(std::string("Closing wrong tag '<") + _tmp +
- ">', expected close of '<" +
- _s.tagStack.top() + ">'.",
- _path, _c, _buffer[_which], _prevs.off);
- }
- _s.tagStack.pop();
- _s.s = NONE;
- continue;
- }
- throw parse_exc("Expected '>'", _path, _c, _buffer[_which],
- _prevs.off);
- case AW_CLOSING:
- if (c == '>') {
- _s.s = WS_SKIP;
- continue;
- }
- case WS_SKIP:
- if (std::isspace(c)) continue;
- _s.s = NONE;
- return true;
- }
- }
- // buffer ended, read new stuff, but copy remaining if needed
- size_t off = 0;
- if (_s.s == IN_TAG_NAME) { //|| IN_TAG_NAME_META) {
- off = _lastBytes - (_ret.name - _buffer[_which]);
- memmove(_buffer[!_which], _ret.name, off);
- _ret.name = _buffer[!_which];
- } else if (_s.s == IN_TAG_NAME_CLOSE || _s.s == IN_ATTRKEY ||
- _s.s == IN_TEXT) {
- off = _lastBytes - (_tmp - _buffer[_which]);
- memmove(_buffer[!_which], _tmp, off);
- _tmp = _buffer[!_which];
- } else if (_s.s == IN_ATTRVAL_SQ || _s.s == IN_ATTRVAL_DQ) {
- off = _lastBytes - (_tmp2 - _buffer[_which]);
- memmove(_buffer[!_which], _tmp2, off);
- _tmp2 = _buffer[!_which];
- }
- assert(off <= BUFFER_S);
- size_t readb = read(_file, _buffer[!_which] + off, BUFFER_S - off);
- if (!readb) break;
- _totReadBef += _lastNewData;
- _which = !_which;
- _lastNewData = readb;
- _lastBytes = _lastNewData + off;
- _c = _buffer[_which] + off;
- }
- if (_s.tagStack.size()) {
- if (_s.tagStack.top() != "[root]") {
- throw parse_exc("XML tree not complete", _path, _c, _buffer[_which],
- _prevs.off);
- }
- _s.tagStack.pop();
- }
- _s.s = NONE;
- _ret.name = "[root]";
- return false;
- }
- // _____________________________________________________________________________
- inline std::string file::decode(const std::string& str) {
- return decode(str.c_str());
- }
- // _____________________________________________________________________________
- inline std::string file::decode(const char* str) {
- const char* c = strchr(str, '&');
- if (!c) return str;
- char* decRet = new char[strlen(str) + 1];
- const char* last = str;
- char* dstPt = decRet;
- for (; c != 0; c = strchr(c + 1, '&')) {
- memcpy(dstPt, last, c - last);
- dstPt += c - last;
- last = c;
- if (*(c + 1) == '#') {
- uint64_t cp = -1;
- char* tail;
- errno = 0;
- if (*(c + 2) == 'x' || *(c + 2) == 'X')
- cp = strtoul(c + 3, &tail, 16);
- else
- cp = strtoul(c + 2, &tail, 10);
- if (*tail == ';' && cp <= 0x1FFFFF && !errno) {
- dstPt += utf8(cp, dstPt);
- last = tail + 1;
- }
- } else {
- const char* e = strchr(c, ';');
- if (e) {
- char* ent = new char[e - 1 - c + 1];
- memcpy(ent, c + 1, e - 1 - c);
- ent[e - 1 - c] = 0;
- const auto it = ENTITIES.find(ent);
- if (it != ENTITIES.end()) {
- const char* utf8 = it->second;
- memcpy(dstPt, utf8, strlen(utf8));
- dstPt += strlen(utf8);
- last += strlen(ent) + 2;
- }
- delete[] ent;
- }
- }
- }
- strcpy(dstPt, last);
- std::string ret(decRet);
- delete[] decRet;
- return ret;
- }
- // _____________________________________________________________________________
- inline size_t file::utf8(size_t cp, char* out) {
- if (cp <= 0x7F) {
- out[0] = cp & 0x7F;
- return 1;
- } else if (cp <= 0x7FF) {
- out[0] = 0xC0 | (cp >> 6);
- out[1] = 0x80 | (cp & 0x3F);
- return 2;
- } else if (cp <= 0xFFFF) {
- out[0] = 0xE0 | (cp >> 12);
- out[1] = 0x80 | ((cp >> 6) & 0x3F);
- out[2] = 0x80 | (cp & 0x3F);
- return 3;
- } else if (cp <= 0x1FFFFF) {
- out[0] = 0xF0 | (cp >> 18);
- out[1] = 0x80 | ((cp >> 12) & 0x3F);
- out[2] = 0x80 | ((cp >> 6) & 0x3F);
- out[3] = 0x80 | (cp & 0x3F);
- return 4;
- }
- return 0;
- }
- }
- #endif // PFXML_H_
|