@@ -14,57 +14,64 @@
 #include "osmfixer/index/SearchIdx.h"
 
 using osmfixer::SearchIdx;
-using osmfixer::TupleList;
 using osmfixer::TripleList;
+using osmfixer::TupleList;
 
 // _____________________________________________________________________________
-void SearchIdx::build() {
+void SearchIdx::build(std::vector<std::pair<IdRange, StatIdx>>& idxs) {
   _qGramIndex.clear();
 
   size_t nameid = 0;
 
   std::map<std::wstring, size_t> tokenIds;
 
-  for (size_t gid = 0; gid < _stats.getGroups().size(); gid++) {
-		auto group = _stats.getGroup(gid);
-
-		// dont index empty groups
-		if (group->stations.size() == 0) continue;
-		if (group->polyStations.size() == 0) continue;
-
-    for (const auto& name : _stats.getGroup(gid)->uniqueNames) {
-      // use wstring to get UTF-8 chars right
-      std::wstring wname =
-          std::wstring_convert<std::codecvt_utf8<wchar_t>>().from_bytes(name);
-
-      _nameToGroup[nameid] = gid;
-			_names.push_back(name);
-
-      for (const auto& token : tokenize(wname)) {
-        if (tokenIds.count(token)) {
-          if (_inv[tokenIds.find(token)->second].size() == 0 ||
-              _inv[tokenIds.find(token)->second].back().first != nameid) {
-            // only use a token once per station
-            _inv[tokenIds.find(token)->second].push_back({nameid, 1});
+  for (const auto& idx : idxs) {
+    for (size_t gid = 0; gid < idx.second.getGroups().size(); gid++) {
+      auto group = idx.second.getGroup(gid);
+
+      // don't index empty groups
+      if (group->stations.size() == 0) continue;
+      if (group->polyStations.size() == 0) continue;
+
+      // slightly prefer larger stations
+      double groupScore = 1.0 + log(group->stations.size()) / 20;
+
+      for (const auto& namep : group->uniqueNames) {
+        const auto& name = namep.second;
+        // use wstring to get UTF-8 chars right
+        std::wstring wname =
+            std::wstring_convert<std::codecvt_utf8<wchar_t>>().from_bytes(name);
+
+        _nameToGroup[nameid] = idx.first.gidStart + gid;
+        _names.push_back(name);
+
+        for (const auto& token : tokenize(wname)) {
+          if (tokenIds.count(token)) {
+            if (_inv[tokenIds.find(token)->second].size() == 0 ||
+                _inv[tokenIds.find(token)->second].back().first != nameid) {
+              // only use a token once per station
+              _inv[tokenIds.find(token)->second].push_back(
+                  {nameid, groupScore});
+            }
+          } else {
+            tokenIds[token] = _tokens.size();
+            _tokens.push_back(token);
+            _inv.push_back(TupleList());
+            _inv.back().push_back({nameid, groupScore});
           }
-        } else {
-          tokenIds[token] = _tokens.size();
-          _tokens.push_back(token);
-          _inv.push_back(TupleList());
-          _inv.back().push_back({nameid, 1});
-        }
 
-        size_t tokenId = tokenIds.find(token)->second;
-        for (const auto& qGram : getQGrams(token)) {
-          if (_qGramIndex[qGram].size() &&
-              _qGramIndex[qGram].back().first == tokenId) {
-            _qGramIndex[qGram].back().second++;
-          } else {
-            _qGramIndex[qGram].push_back({tokenId, 1});
+          size_t tokenId = tokenIds.find(token)->second;
+          for (const auto& qGram : getQGrams(token)) {
+            if (_qGramIndex[qGram].size() &&
+                _qGramIndex[qGram].back().first == tokenId) {
+              _qGramIndex[qGram].back().second++;
+            } else {
+              _qGramIndex[qGram].push_back({tokenId, 1});
+            }
           }
         }
+        nameid++;
       }
-      nameid++;
     }
   }
 
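Two things about the scoring the new build() introduces. First, groupScore = 1.0 + log(n)/20 grows very slowly: a one-station group scores 1.0, a twenty-station group about 1.15; the empty-group checks run before groupScore is computed, so log(0) is never evaluated. Second, the inverted lists stay sorted by name id without any explicit sort, because nameid only ever increases: comparing against back() is enough to keep each name at most once per list. A minimal standalone sketch of that invariant (plain std types only; nothing below is osmfixer API):

#include <cassert>
#include <utility>
#include <vector>

// Postings arrive in non-decreasing name-id order, so checking back() is
// sufficient to keep at most one posting per name in each inverted list.
static void addPosting(std::vector<std::pair<size_t, double>>& list,
                       size_t nameid, double score) {
  if (list.empty() || list.back().first != nameid)
    list.push_back({nameid, score});
}

int main() {
  std::vector<std::pair<size_t, double>> list;
  addPosting(list, 0, 1.0);
  addPosting(list, 0, 1.0);  // same token twice in one name: ignored
  addPosting(list, 2, 1.15);
  assert(list.size() == 2 && list.back().first == 2);
}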

@@ -101,7 +108,7 @@ std::vector<std::wstring> SearchIdx::tokenize(const std::wstring& str) {
   std::vector<std::wstring> ret;
   std::wstring cur;
 
-  const wchar_t* seps = L"_-?'\"|!@#$%^&*()_+}|><.,\\";
+  const wchar_t* seps = L"_-?'\"|!@#$%^&*()_+{}[]|<>.:,\\/";
 
   for (size_t i = 0; i < str.size(); i++) {
     if (std::iswspace(str[i]) || wcschr(seps, str[i])) {
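Only the widened separator set is visible in this hunk. For reference, a self-contained sketch of a tokenizer with this splitting behavior; everything after the visible if-condition is an assumed completion based on the loop's structure, not the project's actual code:

#include <cwchar>
#include <cwctype>
#include <string>
#include <vector>

// Split on whitespace and the separator set from the patch.
static std::vector<std::wstring> tokenizeSketch(const std::wstring& str) {
  std::vector<std::wstring> ret;
  std::wstring cur;
  const wchar_t* seps = L"_-?'\"|!@#$%^&*()_+{}[]|<>.:,\\/";
  for (size_t i = 0; i < str.size(); i++) {
    if (std::iswspace(str[i]) || wcschr(seps, str[i])) {
      if (!cur.empty()) ret.push_back(cur);  // separator closes current token
      cur.clear();
    } else {
      cur.push_back(str[i]);
    }
  }
  if (!cur.empty()) ret.push_back(cur);  // flush trailing token
  return ret;
}
// e.g. L"Gare du Nord (RER B)" -> {L"Gare", L"du", L"Nord", L"RER", L"B"}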

@@ -147,42 +154,43 @@ TripleList SearchIdx::find(const std::string& qry) const {
     double delta = token.size() / 4.0;
     auto res = tokenFind(token);
 
-    std::partial_sort(res.begin(), res.begin() + std::min<size_t>(100, res.size()), res.end(), resComp);
+    std::partial_sort(res.begin(),
+                      res.begin() + std::min<size_t>(100, res.size()),
+                      res.end(), resComp);
 
     std::map<size_t, double> bests;
     std::map<size_t, size_t> bestToken;
 
-		size_t TOPK = 100;
+    size_t TOPK = 100;
 
-		// res contains the 100 best token matches
+    // res contains the 100 best token matches
     for (size_t i = 0; i < res.size() && i < TOPK; i++) {
       for (size_t j = 0; j < _inv[res[i].first].size(); j++) {
+        double score =
+            _inv[res[i].first][j].second * (delta / (1.0 + res[i].second));
 
-				double score = _inv[res[i].first][j].second * (delta / (1.0 + res[i].second));
-
-				if (score > bests[_inv[res[i].first][j].first]) {
-						bests[_inv[res[i].first][j].first] = score;
-						bestToken[_inv[res[i].first][j].first] = res[i].first;
-				}
+        if (score > bests[_inv[res[i].first][j].first]) {
+          bests[_inv[res[i].first][j].first] = score;
+          bestToken[_inv[res[i].first][j].first] = res[i].first;
+        }
       }
     }
 
     for (size_t i = 0; i < res.size() && i < TOPK; i++) {
-			// inv[res[i]] contains all the names the token res[i] occurs in
+      // inv[res[i]] contains all the names the token res[i] occurs in
 
       lists.push_back(_inv[res[i].first]);
 
       // give them a score based on their PED
-			for (size_t j = 0; j < lists.back().size(); j++) {
-				double score = lists.back()[j].second *
-                                 (delta / (1.0 + res[i].second));
-				// best is the token for this name that matched best for the input token
-				size_t best = bestToken[lists.back()[j].first];
-
-				// if it is not this token, we dont count it
-				if (res[i].first != best) score = 0;
+      for (size_t j = 0; j < lists.back().size(); j++) {
+        double score = lists.back()[j].second * (delta / (1.0 + res[i].second));
+        // best is the token for this name that matched best for the input token
+        size_t best = bestToken[lists.back()[j].first];
+
+        // if it is not this token, we don't count it
+        if (res[i].first != best) score = 0;
         lists.back()[j].second = score;
-			}
+      }
     }
   }
 
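The per-name score combines the posting's stored weight with the quality of the token match: score = listScore * (delta / (1.0 + ped)), with delta = tokenLength / 4.0. Assuming res[i].second holds the (prefix) edit distance reported by tokenFind() (the diff suggests this but does not show it), an 8-character query token gives delta = 2.0, so an exact match is weighted 2.0, a distance-1 match 1.0, and a distance-2 match about 0.67:

#include <cstdio>

// One matched index token's contribution to a name's score, as in find().
// listScore is the posting score written by build() (the groupScore);
// ped is assumed to be the prefix edit distance returned by tokenFind().
static double tokenScore(double listScore, size_t queryTokenLen, double ped) {
  double delta = queryTokenLen / 4.0;
  return listScore * (delta / (1.0 + ped));
}

int main() {
  std::printf("%.3f %.3f %.3f\n",
              tokenScore(1.0, 8, 0),   // exact match: 2.000
              tokenScore(1.0, 8, 1),   // one edit:    1.000
              tokenScore(1.0, 8, 2));  // two edits:   0.667
}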

@@ -201,7 +209,8 @@ TripleList SearchIdx::find(const std::string& qry) const {
 
   TripleList fin;
   for (const auto& r : fr) {
-    if (fin.size() == 0 || fin.back().first.first != r.first.first) fin.push_back(r);
+    if (fin.size() == 0 || fin.back().first.first != r.first.first)
+      fin.push_back(r);
   }
 
   std::partial_sort(fin.begin(), fin.begin() + std::min<size_t>(fin.size(), 10),
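The final step keeps one entry per group and ranks only the head of the list: the adjacent-duplicate filter relies on fr already being grouped by group id, and std::partial_sort orders just the best ten results, leaving the tail unordered. A standalone sketch of both steps; TripleList's element type is not shown in the diff, so a hypothetical pair-of-pairs stand-in is used:

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  // stand-in for TripleList entries: ((group id, score), payload)
  typedef std::pair<std::pair<size_t, double>, size_t> Triple;
  std::vector<Triple> fr = {
      {{1, 0.9}, 7}, {{1, 0.4}, 8}, {{3, 0.7}, 9}, {{3, 0.7}, 9}};

  // keep only the first entry per group; assumes fr is grouped by group id
  std::vector<Triple> fin;
  for (const auto& r : fr)
    if (fin.empty() || fin.back().first.first != r.first.first)
      fin.push_back(r);

  // rank only the top 10 by score; the rest stays in unspecified order
  std::partial_sort(fin.begin(), fin.begin() + std::min<size_t>(fin.size(), 10),
                    fin.end(), [](const Triple& a, const Triple& b) {
                      return a.first.second > b.first.second;
                    });

  std::printf("%zu unique groups, best: %zu\n", fin.size(), fin[0].first.first);
}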
 |