be/src/exec/common/multi_string_searcher.h
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #pragma once |
19 | | |
20 | | #include <algorithm> |
21 | | #include <cstddef> |
22 | | #include <cstdint> |
23 | | #include <cstring> |
24 | | #include <limits> |
25 | | #include <memory> |
26 | | #include <vector> |
27 | | |
28 | | #include "core/string_ref.h" |
29 | | #include "core/types.h" |
30 | | #include "exec/common/string_searcher.h" |
31 | | |
32 | | namespace doris { |
33 | | |
34 | | // Searches many literal needles in one haystack. The table stores 2-byte ngrams from a |
35 | | // batch of needles, so scanning the haystack can find candidates for many needles at once. |
36 | | class MultiStringSearcher { |
37 | | using Offset = uint8_t; |
38 | | using Id = uint8_t; |
39 | | using Ngram = uint16_t; |
40 | | |
41 | | // One table entry is one candidate ngram occurrence in a needle. It stores the global |
42 | | // needle id and the 1-based offset of this ngram inside that needle. off == 0 means |
43 | | // the slot is empty, so needle offsets are stored as 1-based values. |
44 | | struct OffsetId { |
45 | | Id id = 0; |
46 | | Offset off = 0; |
47 | | }; |
48 | | |
49 | | public: |
50 | | explicit MultiStringSearcher(const std::vector<StringRef>& needles) |
51 | 3 | : _needles(needles), _hash(new OffsetId[HASH_SIZE]) { |
52 | 3 | _searchers.reserve(_needles.size()); |
53 | 15 | for (const auto& needle : _needles) { |
54 | 15 | _searchers.emplace_back(needle.data, needle.size); |
55 | 15 | } |
56 | 3 | } |
57 | | |
58 | 6 | bool has_more_to_search() { |
59 | 6 | if (_last == _needles.size()) { |
60 | 3 | return false; |
61 | 3 | } |
62 | | |
63 | 3 | std::memset(_hash.get(), 0, HASH_SIZE * sizeof(OffsetId)); |
64 | 3 | _fallback_needles.clear(); |
65 | 3 | _step = std::numeric_limits<size_t>::max(); |
66 | | |
67 | | // Not all needles are necessarily inserted in one table. Keeping the load factor |
68 | | // low makes the linear-probing chains short; if the current batch reaches |
69 | | // SMALL_LIMIT, the caller will scan all rows with this batch and call this method |
70 | | // again for the next batch. _last is the global needle index of the next batch. |
71 | 3 | size_t ngrams = 0; |
72 | 18 | for (; _last < _needles.size(); ++_last) { |
73 | 15 | const auto& needle = _needles[_last]; |
74 | 15 | if (is_fallback_needle(needle.size)) { |
75 | 3 | _fallback_needles.push_back(_last); |
76 | 12 | } else { |
77 | 12 | const size_t needle_ngrams = needle.size - sizeof(Ngram) + 1; |
78 | 12 | if (ngrams + needle_ngrams > SMALL_LIMIT && ngrams != 0) { |
79 | 0 | break; |
80 | 0 | } |
81 | | |
82 | 12 | ngrams += needle_ngrams; |
83 | 12 | _step = std::min(_step, needle_ngrams); |
84 | | // Insert ngrams from the end to the beginning, matching the way the scan |
85 | | // position is later converted back to the candidate needle start. |
86 | 60 | for (auto i = static_cast<int>(needle.size - sizeof(Ngram)); i >= 0; --i) { |
87 | 48 | put_ngram(to_ngram(reinterpret_cast<const uint8_t*>(needle.data + i)), i + 1, |
88 | 48 | _last); |
89 | 48 | } |
90 | 12 | } |
91 | 15 | } |
92 | | |
93 | 3 | return true; |
94 | 6 | } |
95 | | |
96 | 8 | void search_one_all(const uint8_t* haystack, const uint8_t* haystack_end, Int32* answer) const { |
97 | | // answer points to the beginning of one result array row. Entries are indexed by |
98 | | // global needle id, so different batches write into the same full-size row result. |
99 | 8 | for (const size_t needle_index : _fallback_needles) { |
100 | 8 | const auto* match = _searchers[needle_index].search(haystack, haystack_end); |
101 | 8 | if (match != haystack_end) { |
102 | 5 | answer[needle_index] = static_cast<Int32>(match - haystack + 1); |
103 | 5 | } |
104 | 8 | } |
105 | | |
106 | 8 | const auto haystack_size = static_cast<size_t>(haystack_end - haystack); |
107 | 8 | if (_step == std::numeric_limits<size_t>::max() || haystack_size < sizeof(Ngram)) { |
108 | | // The batch may contain only fallback needles, or the haystack may be too short |
109 | | // to read a 2-byte ngram. Fallback needles have already been handled above. |
110 | 2 | return; |
111 | 2 | } |
112 | | |
113 | 19 | for (size_t pos_offset = _step - sizeof(Ngram); pos_offset + sizeof(Ngram) <= haystack_size; |
114 | 13 | pos_offset += _step) { |
115 | 13 | const uint8_t* pos = haystack + pos_offset; |
116 | | // A 2-byte ngram has exactly 65536 possible values. HASH_SIZE is also 65536, |
117 | | // so ngram % HASH_SIZE is effectively direct addressing. Collisions and |
118 | | // duplicate ngrams are kept by linear probing and checked until an empty slot. |
119 | 29 | for (size_t cell = to_ngram(pos) % HASH_SIZE; _hash[cell].off; |
120 | 16 | cell = (cell + 1) % HASH_SIZE) { |
121 | 16 | const size_t ngram_offset = _hash[cell].off - 1; |
122 | 16 | if (pos_offset < ngram_offset) { |
123 | 0 | continue; |
124 | 0 | } |
125 | | |
126 | 16 | const size_t needle_index = _hash[cell].id; |
127 | | // Convert the matched ngram position in the haystack back to the candidate |
128 | | // start position of the whole needle, then verify the full needle below. |
129 | 16 | const size_t match_offset = pos_offset - ngram_offset; |
130 | 16 | const uint8_t* match = haystack + match_offset; |
131 | 16 | const auto candidate_position = static_cast<Int32>(match_offset + 1); |
132 | 16 | if (answer[needle_index] != 0 && candidate_position >= answer[needle_index]) { |
133 | 0 | continue; |
134 | 0 | } |
135 | | |
136 | 16 | const auto& needle = _needles[needle_index]; |
137 | | // The hash table only proves that one 2-byte ngram matched. Full memcmp is |
138 | | // required to discard false positives from duplicate ngrams and collisions. |
139 | 16 | if (haystack_size - match_offset >= needle.size && |
140 | 16 | std::memcmp(match, needle.data, needle.size) == 0) { |
141 | 8 | answer[needle_index] = candidate_position; |
142 | 8 | } |
143 | 16 | } |
144 | 13 | } |
145 | 6 | } |
146 | | |
147 | | private: |
148 | | static constexpr size_t HASH_SIZE = 64 * 1024; |
149 | | static constexpr size_t SMALL_LIMIT = HASH_SIZE / 8; |
150 | | |
151 | 15 | static bool is_fallback_needle(size_t needle_size) { |
152 | | // Very short needles do not have enough 2-byte ngrams to be selective. Very long |
153 | | // needles cannot store their ngram offset in the uint8_t Offset field. |
154 | 15 | return needle_size < 2 * sizeof(Ngram) || needle_size >= std::numeric_limits<Offset>::max(); |
155 | 15 | } |
156 | | |
157 | 61 | static Ngram to_ngram(const uint8_t* pos) { |
158 | | // Read two bytes as a uint16_t. On little-endian machines, for example, "ab" |
159 | | // becomes 97 + 98 * 256. This numeric value is used as the initial hash slot. |
160 | 61 | Ngram ngram; |
161 | 61 | std::memcpy(&ngram, pos, sizeof(ngram)); |
162 | 61 | return ngram; |
163 | 61 | } |
164 | | |
165 | 48 | void put_ngram(Ngram ngram, int offset, size_t needle_index) { |
166 | 48 | size_t cell = ngram % HASH_SIZE; |
167 | | // Linear probing keeps all duplicate ngrams instead of overwriting them. Search |
168 | | // uses the same probe sequence and validates every candidate with full memcmp. |
169 | 66 | while (_hash[cell].off) { |
170 | 18 | cell = (cell + 1) % HASH_SIZE; |
171 | 18 | } |
172 | 48 | _hash[cell] = {.id = static_cast<Id>(needle_index), .off = static_cast<Offset>(offset)}; |
173 | 48 | } |
174 | | |
175 | | const std::vector<StringRef>& _needles; |
176 | | std::vector<size_t> _fallback_needles; |
177 | | std::vector<ASCIICaseSensitiveStringSearcher> _searchers; |
178 | | std::unique_ptr<OffsetId[]> _hash; |
179 | | size_t _step = 0; |
180 | | size_t _last = 0; |
181 | | }; |
182 | | |
183 | | } // namespace doris |