Coverage Report

Created: 2026-06-02 12:10

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exec/common/multi_string_searcher.h
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#pragma once
19
20
#include <algorithm>
21
#include <cstddef>
22
#include <cstdint>
23
#include <cstring>
24
#include <limits>
25
#include <memory>
26
#include <vector>
27
28
#include "core/string_ref.h"
29
#include "core/types.h"
30
#include "exec/common/string_searcher.h"
31
32
namespace doris {
33
34
// Searches many literal needles in one haystack. The table stores 2-byte ngrams from a
35
// batch of needles, so scanning the haystack can find candidates for many needles at once.
36
class MultiStringSearcher {
37
    using Offset = uint8_t;
38
    using Id = uint8_t;
39
    using Ngram = uint16_t;
40
41
    // One table entry is one candidate ngram occurrence in a needle. It stores the global
42
    // needle id and the 1-based offset of this ngram inside that needle. off == 0 means
43
    // the slot is empty, so needle offsets are stored as 1-based values.
44
    struct OffsetId {
45
        Id id = 0;
46
        Offset off = 0;
47
    };
48
49
public:
50
    explicit MultiStringSearcher(const std::vector<StringRef>& needles)
51
3
            : _needles(needles), _hash(new OffsetId[HASH_SIZE]) {
52
3
        _searchers.reserve(_needles.size());
53
15
        for (const auto& needle : _needles) {
54
15
            _searchers.emplace_back(needle.data, needle.size);
55
15
        }
56
3
    }
57
58
6
    bool has_more_to_search() {
59
6
        if (_last == _needles.size()) {
60
3
            return false;
61
3
        }
62
63
3
        std::memset(_hash.get(), 0, HASH_SIZE * sizeof(OffsetId));
64
3
        _fallback_needles.clear();
65
3
        _step = std::numeric_limits<size_t>::max();
66
67
        // Not all needles are necessarily inserted in one table. Keeping the load factor
68
        // low makes the linear-probing chains short; if the current batch reaches
69
        // SMALL_LIMIT, the caller will scan all rows with this batch and call this method
70
        // again for the next batch. _last is the global needle index of the next batch.
71
3
        size_t ngrams = 0;
72
18
        for (; _last < _needles.size(); ++_last) {
73
15
            const auto& needle = _needles[_last];
74
15
            if (is_fallback_needle(needle.size)) {
75
3
                _fallback_needles.push_back(_last);
76
12
            } else {
77
12
                const size_t needle_ngrams = needle.size - sizeof(Ngram) + 1;
78
12
                if (ngrams + needle_ngrams > SMALL_LIMIT && ngrams != 0) {
79
0
                    break;
80
0
                }
81
82
12
                ngrams += needle_ngrams;
83
12
                _step = std::min(_step, needle_ngrams);
84
                // Insert ngrams from the end to the beginning, matching the way the scan
85
                // position is later converted back to the candidate needle start.
86
60
                for (auto i = static_cast<int>(needle.size - sizeof(Ngram)); i >= 0; --i) {
87
48
                    put_ngram(to_ngram(reinterpret_cast<const uint8_t*>(needle.data + i)), i + 1,
88
48
                              _last);
89
48
                }
90
12
            }
91
15
        }
92
93
3
        return true;
94
6
    }
95
96
8
    void search_one_all(const uint8_t* haystack, const uint8_t* haystack_end, Int32* answer) const {
97
        // answer points to the beginning of one result array row. Entries are indexed by
98
        // global needle id, so different batches write into the same full-size row result.
99
8
        for (const size_t needle_index : _fallback_needles) {
100
8
            const auto* match = _searchers[needle_index].search(haystack, haystack_end);
101
8
            if (match != haystack_end) {
102
5
                answer[needle_index] = static_cast<Int32>(match - haystack + 1);
103
5
            }
104
8
        }
105
106
8
        const auto haystack_size = static_cast<size_t>(haystack_end - haystack);
107
8
        if (_step == std::numeric_limits<size_t>::max() || haystack_size < sizeof(Ngram)) {
108
            // The batch may contain only fallback needles, or the haystack may be too short
109
            // to read a 2-byte ngram. Fallback needles have already been handled above.
110
2
            return;
111
2
        }
112
113
19
        for (size_t pos_offset = _step - sizeof(Ngram); pos_offset + sizeof(Ngram) <= haystack_size;
114
13
             pos_offset += _step) {
115
13
            const uint8_t* pos = haystack + pos_offset;
116
            // A 2-byte ngram has exactly 65536 possible values. HASH_SIZE is also 65536,
117
            // so ngram % HASH_SIZE is effectively direct addressing. Collisions and
118
            // duplicate ngrams are kept by linear probing and checked until an empty slot.
119
29
            for (size_t cell = to_ngram(pos) % HASH_SIZE; _hash[cell].off;
120
16
                 cell = (cell + 1) % HASH_SIZE) {
121
16
                const size_t ngram_offset = _hash[cell].off - 1;
122
16
                if (pos_offset < ngram_offset) {
123
0
                    continue;
124
0
                }
125
126
16
                const size_t needle_index = _hash[cell].id;
127
                // Convert the matched ngram position in the haystack back to the candidate
128
                // start position of the whole needle, then verify the full needle below.
129
16
                const size_t match_offset = pos_offset - ngram_offset;
130
16
                const uint8_t* match = haystack + match_offset;
131
16
                const auto candidate_position = static_cast<Int32>(match_offset + 1);
132
16
                if (answer[needle_index] != 0 && candidate_position >= answer[needle_index]) {
133
0
                    continue;
134
0
                }
135
136
16
                const auto& needle = _needles[needle_index];
137
                // The hash table only proves that one 2-byte ngram matched. Full memcmp is
138
                // required to discard false positives from duplicate ngrams and collisions.
139
16
                if (haystack_size - match_offset >= needle.size &&
140
16
                    std::memcmp(match, needle.data, needle.size) == 0) {
141
8
                    answer[needle_index] = candidate_position;
142
8
                }
143
16
            }
144
13
        }
145
6
    }
146
147
private:
148
    static constexpr size_t HASH_SIZE = 64 * 1024;
149
    static constexpr size_t SMALL_LIMIT = HASH_SIZE / 8;
150
151
15
    static bool is_fallback_needle(size_t needle_size) {
152
        // Very short needles do not have enough 2-byte ngrams to be selective. Very long
153
        // needles cannot store their ngram offset in the uint8_t Offset field.
154
15
        return needle_size < 2 * sizeof(Ngram) || needle_size >= std::numeric_limits<Offset>::max();
155
15
    }
156
157
61
    static Ngram to_ngram(const uint8_t* pos) {
158
        // Read two bytes as a uint16_t. On little-endian machines, for example, "ab"
159
        // becomes 97 + 98 * 256. This numeric value is used as the initial hash slot.
160
61
        Ngram ngram;
161
61
        std::memcpy(&ngram, pos, sizeof(ngram));
162
61
        return ngram;
163
61
    }
164
165
48
    void put_ngram(Ngram ngram, int offset, size_t needle_index) {
166
48
        size_t cell = ngram % HASH_SIZE;
167
        // Linear probing keeps all duplicate ngrams instead of overwriting them. Search
168
        // uses the same probe sequence and validates every candidate with full memcmp.
169
66
        while (_hash[cell].off) {
170
18
            cell = (cell + 1) % HASH_SIZE;
171
18
        }
172
48
        _hash[cell] = {.id = static_cast<Id>(needle_index), .off = static_cast<Offset>(offset)};
173
48
    }
174
175
    const std::vector<StringRef>& _needles;
176
    std::vector<size_t> _fallback_needles;
177
    std::vector<ASCIICaseSensitiveStringSearcher> _searchers;
178
    std::unique_ptr<OffsetId[]> _hash;
179
    size_t _step = 0;
180
    size_t _last = 0;
181
};
182
183
} // namespace doris