// Copyright 2017 Patrick Brosi // info@patrickbrosi.de #include #include #include #include #include #include #include #include #include #include "xml/File.h" #include "xml/NamedEnts.h" using namespace xml; // _____________________________________________________________________________ File::File(const std::string& path) : _file(0), _c(0), _lastBytes(0), _which(0), _path(path), _totReadBef(0) { _buffer = new char*[2]; _buffer[0] = new char[BUFFER_S + 1]; _buffer[1] = new char[BUFFER_S + 1]; reset(); } // _____________________________________________________________________________ File::~File() { delete[] _buffer[0]; delete[] _buffer[1]; delete[] _buffer; close(_file); } // _____________________________________________________________________________ void File::reset() { _which = 0; _s.s = NONE; _s.hanging = 0; _totReadBef = 0; if (_file) close(_file); _file = open(_path.c_str(), O_RDONLY); posix_fadvise(_file, 0, 0, POSIX_FADV_SEQUENTIAL); _lastBytes = read(_file, _buffer[_which], BUFFER_S); _lastNewData = _lastBytes; _c = _buffer[_which]; while (!_s.tagStack.empty()) _s.tagStack.pop(); _s.tagStack.push("[root]"); _prevs = _s; } // _____________________________________________________________________________ size_t File::level() const { return _s.tagStack.size() - _s.hanging; } // _____________________________________________________________________________ ParserState File::state() { return _prevs; } // _____________________________________________________________________________ void File::setState(const ParserState& s) { _s = s; _prevs = s; lseek(_file, _s.off, SEEK_SET); _totReadBef = _s.off; _lastBytes = read(_file, _buffer[_which], BUFFER_S); _lastNewData = _lastBytes; _c = _buffer[_which]; next(); } // _____________________________________________________________________________ const Tag& File::get() const { return _ret; } // _____________________________________________________________________________ bool File::next() { // avoid too much stack copying if (_prevs.tagStack.size() != _s.tagStack.size() || _prevs.tagStack.top() != _s.tagStack.top()) { _prevs.tagStack = _s.tagStack; } _prevs.s = _s.s; _prevs.hanging = _s.hanging; _prevs.off = _totReadBef + (_c - _buffer[_which]) - (_lastBytes - _lastNewData); if (_s.hanging) _s.hanging--; _ret.name = 0; _ret.attrs.clear(); void* i; while (_lastBytes) { for (; _c - _buffer[_which] < _lastBytes; ++_c) { char c = *_c; switch (_s.s) { case NONE: if (std::isspace(c)) continue; else if (c == '<') { _s.s = IN_TAG_TENTATIVE; continue; } else { _s.s = IN_TEXT; continue; } case IN_TEXT: throw XmlFileException("text nodes not yet supported"); case IN_TAG_TENTATIVE: if (c == '/') { _s.s = IN_TAG_NAME_CLOSE; _tmp = _c + 1; continue; } else if (c == '?') { _s.s = IN_TAG_NAME_META; continue; } else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') { _s.s = IN_TAG_NAME; _ret.name = _c; continue; } case IN_TAG: if (std::isspace(c)) continue; else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') { _s.s = IN_ATTRKEY; _tmp = _c; continue; } else if (c == '/') { _s.s = AW_CLOSING; continue; } else if (c == '>') { _s.hanging++; _s.tagStack.push(_ret.name); _s.s = WS_SKIP; continue; } else { throw XmlFileException("expected valid tag"); } case IN_ATTRVAL_SQ: i = memchr(_c, '\'', _lastBytes - (_c - _buffer[_which])); if (!i) { _c = _buffer[_which] + _lastBytes; continue; } else { _c = (char*)i; _s.s = IN_TAG; *_c = 0; _ret.attrs[_tmp] = _tmp2; continue; } case IN_ATTRVAL_DQ: i = memchr(_c, '"', _lastBytes - (_c - _buffer[_which])); if (!i) { _c = _buffer[_which] + _lastBytes; continue; } else { _c = (char*)i; _s.s = IN_TAG; *_c = 0; _ret.attrs[_tmp] = _tmp2; continue; } case AW_IN_ATTRVAL: if (std::isspace(c)) continue; else if (c == '\'') { _s.s = IN_ATTRVAL_SQ; _tmp2 = _c + 1; continue; } else if (c == '"') { _s.s = IN_ATTRVAL_DQ; _tmp2 = _c + 1; continue; } else { throw XmlFileException("expected attribute value"); } case IN_ATTRKEY: if (std::isspace(c)) { *_c = 0; _s.s = AFTER_ATTRKEY; continue; } else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') { continue; } else if (c == '=') { *_c = 0; _s.s = AW_IN_ATTRVAL; continue; } std::cerr << "ERROR 5" << std::endl; exit(0); case AFTER_ATTRKEY: if (std::isspace(c)) continue; else if (c == '=') { _s.s = AW_IN_ATTRVAL; continue; } else { // TODO: error continue; } case IN_TAG_NAME: if (std::isspace(c)) { *_c = 0; _s.s = IN_TAG; continue; } else if (c == '>') { *_c = 0; _s.hanging++; _s.tagStack.push(_ret.name); _s.s = WS_SKIP; continue; } else if (c == '/') { *_c = 0; _s.s = AW_CLOSING; continue; } else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') { continue; } case IN_TAG_NAME_META: // TODO: read meta tags! if (c == '>') { _s.s = NONE; continue; } continue; case IN_TAG_NAME_CLOSE: if (std::isspace(c)) { *_c = 0; _s.s = IN_TAG_CLOSE; continue; } else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') { continue; } else if (c == '>') { *_c = 0; if (_tmp != _s.tagStack.top()) { throw XmlFileException("closing wrong tag"); } _s.tagStack.pop(); _s.s = NONE; continue; } case IN_TAG_CLOSE: if (std::isspace(c)) continue; else if (c == '>') { if (_tmp != _s.tagStack.top()) { throw XmlFileException("closing wrong tag"); } _s.tagStack.pop(); _s.s = NONE; continue; } else { throw XmlFileException("expected '>'"); } case AW_CLOSING: if (c == '>') { _s.s = WS_SKIP; continue; } case WS_SKIP: if (std::isspace(c)) continue; else { _s.s = NONE; return true; } } } // buffer ended, read new stuff, but copy remaining if needed size_t off = 0; if (_s.s == IN_TAG_NAME) { //|| IN_TAG_NAME_META) { off = _lastBytes - (_ret.name - _buffer[_which]); memmove(_buffer[!_which], _ret.name, off); _ret.name = _buffer[!_which]; } else if (_s.s == IN_TAG_NAME_CLOSE || _s.s == IN_ATTRKEY || _s.s == IN_TEXT) { off = _lastBytes - (_tmp - _buffer[_which]); memmove(_buffer[!_which], _tmp, off); _tmp = _buffer[!_which]; } else if (_s.s == IN_ATTRVAL_SQ || _s.s == IN_ATTRVAL_DQ) { off = _lastBytes - (_tmp2 - _buffer[_which]); memmove(_buffer[!_which], _tmp2, off); _tmp2 = _buffer[!_which]; } assert(off <= BUFFER_S); size_t readb = read(_file, _buffer[!_which] + off, BUFFER_S - off); if (!readb) break; _totReadBef += _lastNewData; _which = !_which; _lastNewData = readb; _lastBytes = _lastNewData + off; _c = _buffer[_which] + off; } if (_s.tagStack.top() != "[root]") { // TODO error throw XmlFileException("XML tree not complete"); } else { _s.tagStack.pop(); } _s.s = NONE; _ret.name = "[root]"; return false; } // _____________________________________________________________________________ std::string File::decode(const std::string& str) { return decode(str.c_str()); } // _____________________________________________________________________________ std::string File::decode(const char* str) { const char* c = strchr(str, '&'); if (!c) return str; char decRet[strlen(str) + 1]; const char* last = str; char* dstPt = decRet; for (; c != 0; c = strchr(c + 1, '&')) { memcpy(dstPt, last, c - last); dstPt += c - last; last = c; if (*(c + 1) == '#') { uint64_t cp = -1; char* tail; errno = 0; if (*(c + 2) == 'x' || *(c + 2) == 'X') cp = strtoul(c + 3, &tail, 16); else cp = strtoul(c + 2, &tail, 10); if (*tail == ';' && cp <= 0x1FFFFF && !errno) { dstPt += utf8(cp, dstPt); last = tail + 1; } } else { const char* e = strchr(c, ';'); if (e) { char ent[e - 1 - c + 1]; memcpy(ent, c + 1, e - 1 - c); ent[e - 1 - c] = 0; const auto it = xml::ENTITIES.find(ent); if (it != xml::ENTITIES.end()) { const char* utf8 = it->second; memcpy(dstPt, utf8, strlen(utf8)); dstPt += strlen(utf8); last += strlen(ent) + 2; } } } } strcpy(dstPt, last); return decRet; } // _____________________________________________________________________________ size_t File::utf8(size_t cp, char* out) { if (cp <= 0x7F) { out[0] = cp & 0x7F; return 1; } else if (cp <= 0x7FF) { out[0] = 0xC0 | (cp >> 6); out[1] = 0x80 | (cp & 0x3F); return 2; } else if (cp <= 0xFFFF) { out[0] = 0xE0 | (cp >> 12); out[1] = 0x80 | ((cp >> 6) & 0x3F); out[2] = 0x80 | (cp & 0x3F); return 3; } else if (cp <= 0x1FFFFF) { out[0] = 0xF0 | (cp >> 18); out[1] = 0x80 | ((cp >> 12) & 0x3F); out[2] = 0x80 | ((cp >> 6) & 0x3F); out[3] = 0x80 | (cp & 0x3F); return 4; } return 0; }