123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403 |
- // Copyright 2017 Patrick Brosi
- // info@patrickbrosi.de
- #include <fcntl.h>
- #include <sys/stat.h>
- #include <sys/types.h>
- #include <unistd.h>
- #include <cassert>
- #include <cstring>
- #include <fstream>
- #include <iostream>
- #include <map>
- #include "xml/File.h"
- #include "xml/NamedEnts.h"
- using namespace xml;
- // _____________________________________________________________________________
- File::File(const std::string& path)
- : _file(0), _c(0), _lastBytes(0), _which(0), _path(path), _totReadBef(0) {
- _buffer = new char*[2];
- _buffer[0] = new char[BUFFER_S + 1];
- _buffer[1] = new char[BUFFER_S + 1];
- reset();
- }
- // _____________________________________________________________________________
- File::~File() {
- delete[] _buffer[0];
- delete[] _buffer[1];
- delete[] _buffer;
- close(_file);
- }
- // _____________________________________________________________________________
- void File::reset() {
- _which = 0;
- _s.s = NONE;
- _s.hanging = 0;
- _totReadBef = 0;
- if (_file) close(_file);
- _file = open(_path.c_str(), O_RDONLY);
- if (_file < 0) throw XmlFileException(std::string("could not open ") + _path);
- posix_fadvise(_file, 0, 0, POSIX_FADV_SEQUENTIAL);
- _lastBytes = read(_file, _buffer[_which], BUFFER_S);
- _lastNewData = _lastBytes;
- _c = _buffer[_which];
- while (!_s.tagStack.empty()) _s.tagStack.pop();
- _s.tagStack.push("[root]");
- _prevs = _s;
- }
- // _____________________________________________________________________________
- size_t File::level() const { return _s.tagStack.size() - _s.hanging; }
- // _____________________________________________________________________________
- ParserState File::state() { return _prevs; }
- // _____________________________________________________________________________
- void File::setState(const ParserState& s) {
- _s = s;
- _prevs = s;
- lseek(_file, _s.off, SEEK_SET);
- _totReadBef = _s.off;
- _lastBytes = read(_file, _buffer[_which], BUFFER_S);
- _lastNewData = _lastBytes;
- _c = _buffer[_which];
- next();
- }
- // _____________________________________________________________________________
- const Tag& File::get() const { return _ret; }
- // _____________________________________________________________________________
- bool File::next() {
- if (!_s.tagStack.size()) return false;
- // avoid too much stack copying
- if (_prevs.tagStack.size() != _s.tagStack.size() ||
- _prevs.tagStack.top() != _s.tagStack.top()) {
- _prevs.tagStack = _s.tagStack;
- }
- _prevs.s = _s.s;
- _prevs.hanging = _s.hanging;
- _prevs.off =
- _totReadBef + (_c - _buffer[_which]) - (_lastBytes - _lastNewData);
- if (_s.hanging) _s.hanging--;
- _ret.name = 0;
- _ret.attrs.clear();
- void* i;
- while (_lastBytes) {
- for (; _c - _buffer[_which] < _lastBytes; ++_c) {
- char c = *_c;
- switch (_s.s) {
- case NONE:
- if (std::isspace(c))
- continue;
- else if (c == '<') {
- _s.s = IN_TAG_TENTATIVE;
- continue;
- } else {
- _s.s = IN_TEXT;
- continue;
- }
- case IN_TEXT:
- throw XmlFileException("text nodes not yet supported");
- case IN_TAG_TENTATIVE:
- if (c == '/') {
- _s.s = IN_TAG_NAME_CLOSE;
- _tmp = _c + 1;
- continue;
- } else if (c == '?') {
- _s.s = IN_TAG_NAME_META;
- continue;
- } else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
- _s.s = IN_TAG_NAME;
- _ret.name = _c;
- continue;
- }
- case IN_TAG:
- if (std::isspace(c))
- continue;
- else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
- _s.s = IN_ATTRKEY;
- _tmp = _c;
- continue;
- } else if (c == '/') {
- _s.s = AW_CLOSING;
- continue;
- } else if (c == '>') {
- _s.hanging++;
- _s.tagStack.push(_ret.name);
- _s.s = WS_SKIP;
- continue;
- } else {
- throw XmlFileException("expected valid tag");
- }
- case IN_ATTRVAL_SQ:
- i = memchr(_c, '\'', _lastBytes - (_c - _buffer[_which]));
- if (!i) {
- _c = _buffer[_which] + _lastBytes;
- continue;
- } else {
- _c = (char*)i;
- _s.s = IN_TAG;
- *_c = 0;
- _ret.attrs[_tmp] = _tmp2;
- continue;
- }
- case IN_ATTRVAL_DQ:
- i = memchr(_c, '"', _lastBytes - (_c - _buffer[_which]));
- if (!i) {
- _c = _buffer[_which] + _lastBytes;
- continue;
- } else {
- _c = (char*)i;
- _s.s = IN_TAG;
- *_c = 0;
- _ret.attrs[_tmp] = _tmp2;
- continue;
- }
- case AW_IN_ATTRVAL:
- if (std::isspace(c))
- continue;
- else if (c == '\'') {
- _s.s = IN_ATTRVAL_SQ;
- _tmp2 = _c + 1;
- continue;
- } else if (c == '"') {
- _s.s = IN_ATTRVAL_DQ;
- _tmp2 = _c + 1;
- continue;
- } else {
- throw XmlFileException("expected attribute value");
- }
- case IN_ATTRKEY:
- if (std::isspace(c)) {
- *_c = 0;
- _s.s = AFTER_ATTRKEY;
- continue;
- } else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
- continue;
- } else if (c == '=') {
- *_c = 0;
- _s.s = AW_IN_ATTRVAL;
- continue;
- }
- throw XmlFileException("expected attribute key char or =");
- case AFTER_ATTRKEY:
- if (std::isspace(c))
- continue;
- else if (c == '=') {
- _s.s = AW_IN_ATTRVAL;
- continue;
- } else {
- // TODO: error
- continue;
- }
- case IN_TAG_NAME:
- if (std::isspace(c)) {
- *_c = 0;
- _s.s = IN_TAG;
- continue;
- } else if (c == '>') {
- *_c = 0;
- _s.hanging++;
- _s.tagStack.push(_ret.name);
- _s.s = WS_SKIP;
- continue;
- } else if (c == '/') {
- *_c = 0;
- _s.s = AW_CLOSING;
- continue;
- } else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
- continue;
- }
- case IN_TAG_NAME_META:
- // TODO: read meta tags!
- if (c == '>') {
- _s.s = NONE;
- continue;
- }
- continue;
- case IN_TAG_NAME_CLOSE:
- if (std::isspace(c)) {
- *_c = 0;
- _s.s = IN_TAG_CLOSE;
- continue;
- } else if (std::isalnum(c) || c == '-' || c == '_' || c == '.') {
- continue;
- } else if (c == '>') {
- *_c = 0;
- if (_tmp != _s.tagStack.top()) {
- throw XmlFileException("closing wrong tag");
- }
- _s.tagStack.pop();
- _s.s = NONE;
- continue;
- }
- case IN_TAG_CLOSE:
- if (std::isspace(c))
- continue;
- else if (c == '>') {
- if (_tmp != _s.tagStack.top()) {
- throw XmlFileException("closing wrong tag");
- }
- _s.tagStack.pop();
- _s.s = NONE;
- continue;
- } else {
- throw XmlFileException("expected '>'");
- }
- case AW_CLOSING:
- if (c == '>') {
- _s.s = WS_SKIP;
- continue;
- }
- case WS_SKIP:
- if (std::isspace(c))
- continue;
- else {
- _s.s = NONE;
- return true;
- }
- }
- }
- // buffer ended, read new stuff, but copy remaining if needed
- size_t off = 0;
- if (_s.s == IN_TAG_NAME) { //|| IN_TAG_NAME_META) {
- off = _lastBytes - (_ret.name - _buffer[_which]);
- memmove(_buffer[!_which], _ret.name, off);
- _ret.name = _buffer[!_which];
- } else if (_s.s == IN_TAG_NAME_CLOSE || _s.s == IN_ATTRKEY ||
- _s.s == IN_TEXT) {
- off = _lastBytes - (_tmp - _buffer[_which]);
- memmove(_buffer[!_which], _tmp, off);
- _tmp = _buffer[!_which];
- } else if (_s.s == IN_ATTRVAL_SQ || _s.s == IN_ATTRVAL_DQ) {
- off = _lastBytes - (_tmp2 - _buffer[_which]);
- memmove(_buffer[!_which], _tmp2, off);
- _tmp2 = _buffer[!_which];
- }
- assert(off <= BUFFER_S);
- size_t readb = read(_file, _buffer[!_which] + off, BUFFER_S - off);
- if (!readb) break;
- _totReadBef += _lastNewData;
- _which = !_which;
- _lastNewData = readb;
- _lastBytes = _lastNewData + off;
- _c = _buffer[_which] + off;
- }
- if (_s.tagStack.size()) {
- if (_s.tagStack.top() != "[root]") {
- // TODO error
- throw XmlFileException("XML tree not complete");
- } else {
- _s.tagStack.pop();
- }
- }
- _s.s = NONE;
- _ret.name = "[root]";
- return false;
- }
- // _____________________________________________________________________________
- std::string File::decode(const std::string& str) { return decode(str.c_str()); }
- // _____________________________________________________________________________
- std::string File::decode(const char* str) {
- const char* c = strchr(str, '&');
- if (!c) return str;
- char decRet[strlen(str) + 1];
- const char* last = str;
- char* dstPt = decRet;
- for (; c != 0; c = strchr(c + 1, '&')) {
- memcpy(dstPt, last, c - last);
- dstPt += c - last;
- last = c;
- if (*(c + 1) == '#') {
- uint64_t cp = -1;
- char* tail;
- errno = 0;
- if (*(c + 2) == 'x' || *(c + 2) == 'X')
- cp = strtoul(c + 3, &tail, 16);
- else
- cp = strtoul(c + 2, &tail, 10);
- if (*tail == ';' && cp <= 0x1FFFFF && !errno) {
- dstPt += utf8(cp, dstPt);
- last = tail + 1;
- }
- } else {
- const char* e = strchr(c, ';');
- if (e) {
- char ent[e - 1 - c + 1];
- memcpy(ent, c + 1, e - 1 - c);
- ent[e - 1 - c] = 0;
- const auto it = xml::ENTITIES.find(ent);
- if (it != xml::ENTITIES.end()) {
- const char* utf8 = it->second;
- memcpy(dstPt, utf8, strlen(utf8));
- dstPt += strlen(utf8);
- last += strlen(ent) + 2;
- }
- }
- }
- }
- strcpy(dstPt, last);
- return decRet;
- }
- // _____________________________________________________________________________
- size_t File::utf8(size_t cp, char* out) {
- if (cp <= 0x7F) {
- out[0] = cp & 0x7F;
- return 1;
- } else if (cp <= 0x7FF) {
- out[0] = 0xC0 | (cp >> 6);
- out[1] = 0x80 | (cp & 0x3F);
- return 2;
- } else if (cp <= 0xFFFF) {
- out[0] = 0xE0 | (cp >> 12);
- out[1] = 0x80 | ((cp >> 6) & 0x3F);
- out[2] = 0x80 | (cp & 0x3F);
- return 3;
- } else if (cp <= 0x1FFFFF) {
- out[0] = 0xF0 | (cp >> 18);
- out[1] = 0x80 | ((cp >> 12) & 0x3F);
- out[2] = 0x80 | ((cp >> 6) & 0x3F);
- out[3] = 0x80 | (cp & 0x3F);
- return 4;
- }
- return 0;
- }
|