@@ -14,57 +14,64 @@
 #include "osmfixer/index/SearchIdx.h"
 
 using osmfixer::SearchIdx;
-using osmfixer::TupleList;
 using osmfixer::TripleList;
+using osmfixer::TupleList;
 
 // _____________________________________________________________________________
-void SearchIdx::build() {
+void SearchIdx::build(std::vector<std::pair<IdRange, StatIdx>>& idxs) {
   _qGramIndex.clear();
 
   size_t nameid = 0;
 
   std::map<std::wstring, size_t> tokenIds;
 
-  for (size_t gid = 0; gid < _stats.getGroups().size(); gid++) {
-    auto group = _stats.getGroup(gid);
-
-    // dont index empty groups
-    if (group->stations.size() == 0) continue;
-    if (group->polyStations.size() == 0) continue;
-
-    for (const auto& name : _stats.getGroup(gid)->uniqueNames) {
-      // use wstring to get UTF-8 chars right
-      std::wstring wname =
-          std::wstring_convert<std::codecvt_utf8<wchar_t>>().from_bytes(name);
-
-      _nameToGroup[nameid] = gid;
-      _names.push_back(name);
-
-      for (const auto& token : tokenize(wname)) {
-        if (tokenIds.count(token)) {
-          if (_inv[tokenIds.find(token)->second].size() == 0 ||
-              _inv[tokenIds.find(token)->second].back().first != nameid) {
-            // only use a token once per station
-            _inv[tokenIds.find(token)->second].push_back({nameid, 1});
+  for (const auto& idx : idxs) {
+    for (size_t gid = 0; gid < idx.second.getGroups().size(); gid++) {
+      auto group = idx.second.getGroup(gid);
+
+      // slightly prefer larger stations
+      double groupScore = 1.0 + log(group->stations.size()) / 20;
+
+      // dont index empty groups
+      if (group->stations.size() == 0) continue;
+      if (group->polyStations.size() == 0) continue;
+
+      for (const auto& namep : group->uniqueNames) {
+        // use wstring to get UTF-8 chars right
+        const auto& name = namep.second;
+        std::wstring wname =
+            std::wstring_convert<std::codecvt_utf8<wchar_t>>().from_bytes(name);
+
+        _nameToGroup[nameid] = idx.first.gidStart + gid;
+        _names.push_back(name);
+
+        for (const auto& token : tokenize(wname)) {
+          if (tokenIds.count(token)) {
+            if (_inv[tokenIds.find(token)->second].size() == 0 ||
+                _inv[tokenIds.find(token)->second].back().first != nameid) {
+              // only use a token once per station
+              _inv[tokenIds.find(token)->second].push_back(
+                  {nameid, groupScore});
+            }
+          } else {
+            tokenIds[token] = _tokens.size();
+            _tokens.push_back(token);
+            _inv.push_back(TupleList());
+            _inv.back().push_back({nameid, groupScore});
           }
-        } else {
-          tokenIds[token] = _tokens.size();
-          _tokens.push_back(token);
-          _inv.push_back(TupleList());
-          _inv.back().push_back({nameid, 1});
-        }
 
-        size_t tokenId = tokenIds.find(token)->second;
-        for (const auto& qGram : getQGrams(token)) {
-          if (_qGramIndex[qGram].size() &&
-              _qGramIndex[qGram].back().first == tokenId) {
-            _qGramIndex[qGram].back().second++;
-          } else {
-            _qGramIndex[qGram].push_back({tokenId, 1});
+          size_t tokenId = tokenIds.find(token)->second;
+          for (const auto& qGram : getQGrams(token)) {
+            if (_qGramIndex[qGram].size() &&
+                _qGramIndex[qGram].back().first == tokenId) {
+              _qGramIndex[qGram].back().second++;
+            } else {
+              _qGramIndex[qGram].push_back({tokenId, 1});
+            }
           }
         }
+        nameid++;
      }
-      nameid++;
    }
  }
 
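Note on the new weighting: each posting now carries groupScore instead of the constant 1, so names of larger station groups rank slightly higher. A minimal standalone sketch (not part of the patch) of how 1.0 + log(n) / 20 grows with the number of stations in a group:

```cpp
#include <cmath>
#include <cstdio>
#include <initializer_list>

int main() {
  for (double n : {1.0, 5.0, 50.0, 500.0}) {
    // Same formula as the new groupScore line in SearchIdx::build().
    double groupScore = 1.0 + std::log(n) / 20;
    std::printf("stations=%g  groupScore=%.3f\n", n, groupScore);
  }
}
```

For realistic group sizes the factor stays between 1.0 and roughly 1.4, so it nudges rankings toward bigger stations rather than dominating the token score.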
@@ -101,7 +108,7 @@ std::vector<std::wstring> SearchIdx::tokenize(const std::wstring& str) {
   std::vector<std::wstring> ret;
   std::wstring cur;
 
-  const wchar_t* seps = L"_-?'\"|!@#$%^&*()_+}|><.,\\";
+  const wchar_t* seps = L"_-?'\"|!@#$%^&*()_+{}[]|<>.:,\\/";
 
   for (size_t i = 0; i < str.size(); i++) {
     if (std::iswspace(str[i]) || wcschr(seps, str[i])) {
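For reference, the widened separator set now also splits names on '/', ':', and brackets. A simplified sketch of the boundary rule only (illustrative; the real tokenize() in this file may normalize further, e.g. case folding):

```cpp
#include <cwchar>
#include <cwctype>
#include <iostream>
#include <string>
#include <vector>

// A character ends the current token if it is whitespace or appears in the
// separator string from the patched line above.
std::vector<std::wstring> splitLikeTokenize(const std::wstring& str) {
  const wchar_t* seps = L"_-?'\"|!@#$%^&*()_+{}[]|<>.:,\\/";
  std::vector<std::wstring> ret;
  std::wstring cur;
  for (wchar_t c : str) {
    if (std::iswspace(c) || std::wcschr(seps, c)) {
      if (!cur.empty()) ret.push_back(cur);
      cur.clear();
    } else {
      cur.push_back(c);
    }
  }
  if (!cur.empty()) ret.push_back(cur);
  return ret;
}

int main() {
  // '/', '[' and ']' are now separators, so this yields 5 tokens.
  for (const auto& tok : splitLikeTokenize(L"Hauptbahnhof Nord/ZOB [Steig 3]")) {
    std::wcout << tok << L"\n";
  }
}
```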
@@ -147,42 +154,43 @@ TripleList SearchIdx::find(const std::string& qry) const {
     double delta = token.size() / 4.0;
     auto res = tokenFind(token);
 
-    std::partial_sort(res.begin(), res.begin() + std::min<size_t>(100, res.size()), res.end(), resComp);
+    std::partial_sort(res.begin(),
+                      res.begin() + std::min<size_t>(100, res.size()),
+                      res.end(), resComp);
 
     std::map<size_t, double> bests;
     std::map<size_t, size_t> bestToken;
 
-    size_t TOPK = 100;
+    size_t TOPK = 100;
 
-    // res contains the 100 best token matches
+    // res contains the 100 best token matches
     for (size_t i = 0; i < res.size() && i < TOPK; i++) {
       for (size_t j = 0; j < _inv[res[i].first].size(); j++) {
+        double score =
+            _inv[res[i].first][j].second * (delta / (1.0 + res[i].second));
 
-        double score = _inv[res[i].first][j].second * (delta / (1.0 + res[i].second));
-
-        if (score > bests[_inv[res[i].first][j].first]) {
-          bests[_inv[res[i].first][j].first] = score;
-          bestToken[_inv[res[i].first][j].first] = res[i].first;
-        }
+        if (score > bests[_inv[res[i].first][j].first]) {
+          bests[_inv[res[i].first][j].first] = score;
+          bestToken[_inv[res[i].first][j].first] = res[i].first;
+        }
      }
    }
 
     for (size_t i = 0; i < res.size() && i < TOPK; i++) {
-      // inv[res[i]] contains all the names the token res[i] occurs in
+      // inv[res[i]] contains all the names the token res[i] occurs in
 
       lists.push_back(_inv[res[i].first]);
 
       // give them a score based on their PED
-      for (size_t j = 0; j < lists.back().size(); j++) {
-        double score = lists.back()[j].second *
-            (delta / (1.0 + res[i].second));
-        // best is the token for this name that matched best for the input token
-        size_t best = bestToken[lists.back()[j].first];
-
-        // if it is not this token, we dont count it
-        if (res[i].first != best) score = 0;
+      for (size_t j = 0; j < lists.back().size(); j++) {
+        double score = lists.back()[j].second * (delta / (1.0 + res[i].second));
+        // best is the token for this name that matched best for the input token
+        size_t best = bestToken[lists.back()[j].first];
+
+        // if it is not this token, we dont count it
+        if (res[i].first != best) score = 0;
         lists.back()[j].second = score;
-      }
+      }
    }
  }
 
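Sketch of how the per-name score in this hunk combines the posting weight with the token match quality, assuming res[i].second holds the (prefix) edit distance reported by tokenFind(); nameScore() below is a hypothetical helper, not part of the code:

```cpp
#include <cstdio>
#include <string>

// Combines the inverted-index weight (the groupScore stored in _inv) with the
// match quality of the query token, mirroring score = weight * (delta / (1 + dist))
// with delta = |query token| / 4 as in find().
double nameScore(const std::string& queryToken, double tokenDist,
                 double invWeight) {
  double delta = queryToken.size() / 4.0;
  return invWeight * (delta / (1.0 + tokenDist));
}

int main() {
  // An exact match (distance 0) on a larger group outranks a distance-2
  // match on a smaller group for the same 8-character query token.
  std::printf("exact, larger group:  %.3f\n", nameScore("alexande", 0, 1.3));
  std::printf("fuzzy, smaller group: %.3f\n", nameScore("alexande", 2, 1.0));
}
```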
@@ -201,7 +209,8 @@ TripleList SearchIdx::find(const std::string& qry) const {
 
   TripleList fin;
   for (const auto& r : fr) {
-    if (fin.size() == 0 || fin.back().first.first != r.first.first) fin.push_back(r);
+    if (fin.size() == 0 || fin.back().first.first != r.first.first)
+      fin.push_back(r);
  }
 
   std::partial_sort(fin.begin(), fin.begin() + std::min<size_t>(fin.size(), 10),