be/src/exec/common/multi_string_searcher.h

Source
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <limits>
#include <memory>
#include <vector>

#include "core/string_ref.h"
#include "core/types.h"
#include "exec/common/string_searcher.h"

namespace doris {

// Searches many literal needles in one haystack. The table stores 2-byte ngrams from a
// batch of needles, so scanning the haystack can find candidates for many needles at once.
class MultiStringSearcher {
    using Offset = uint8_t;
    using Id = uint8_t;
    using Ngram = uint16_t;

    // One table entry is one candidate ngram occurrence in a needle. It stores the global
    // needle id and the 1-based offset of this ngram inside that needle. off == 0 means
    // the slot is empty, so needle offsets are stored as 1-based values.
    struct OffsetId {
        Id id = 0;
        Offset off = 0;
    };

public:
    explicit MultiStringSearcher(const std::vector<StringRef>& needles)
            : _needles(needles), _hash(new OffsetId[HASH_SIZE]) {
        _searchers.reserve(_needles.size());
        for (const auto& needle : _needles) {
            _searchers.emplace_back(needle.data, needle.size);
        }
    }

    bool has_more_to_search() {
        if (_last == _needles.size()) {
            return false;
        }

        std::memset(_hash.get(), 0, HASH_SIZE * sizeof(OffsetId));
        _fallback_needles.clear();
        _step = std::numeric_limits<size_t>::max();

        // Not all needles are necessarily inserted in one table. Keeping the load factor
        // low makes the linear-probing chains short; if the current batch reaches
        // SMALL_LIMIT, the caller will scan all rows with this batch and call this method
        // again for the next batch. _last is the global needle index of the next batch.
        size_t ngrams = 0;
        for (; _last < _needles.size(); ++_last) {
            const auto& needle = _needles[_last];
            if (is_fallback_needle(needle.size)) {
                _fallback_needles.push_back(_last);
            } else {
                const size_t needle_ngrams = needle.size - sizeof(Ngram) + 1;
                if (ngrams + needle_ngrams > SMALL_LIMIT && ngrams != 0) {
                    break;
                }

                ngrams += needle_ngrams;
                _step = std::min(_step, needle_ngrams);
                // Insert ngrams from the end to the beginning, matching the way the scan
                // position is later converted back to the candidate needle start.
                for (auto i = static_cast<int>(needle.size - sizeof(Ngram)); i >= 0; --i) {
                    put_ngram(to_ngram(reinterpret_cast<const uint8_t*>(needle.data + i)), i + 1,
                              _last);
                }
            }
        }

        return true;
    }

    void search_one_all(const uint8_t* haystack, const uint8_t* haystack_end, Int32* answer) const {
        // answer points to the beginning of one result array row. Entries are indexed by
        // global needle id, so different batches write into the same full-size row result.
        for (const size_t needle_index : _fallback_needles) {
            const auto* match = _searchers[needle_index].search(haystack, haystack_end);
            if (match != haystack_end) {
                answer[needle_index] = static_cast<Int32>(match - haystack + 1);
            }
        }

        const auto haystack_size = static_cast<size_t>(haystack_end - haystack);
        if (_step == std::numeric_limits<size_t>::max() || haystack_size < sizeof(Ngram)) {
            // The batch may contain only fallback needles, or the haystack may be too short
            // to read a 2-byte ngram. Fallback needles have already been handled above.
            return;
        }

        for (size_t pos_offset = _step - sizeof(Ngram); pos_offset + sizeof(Ngram) <= haystack_size;
             pos_offset += _step) {
            const uint8_t* pos = haystack + pos_offset;
            // A 2-byte ngram has exactly 65536 possible values. HASH_SIZE is also 65536,
            // so ngram % HASH_SIZE is effectively direct addressing. Collisions and
            // duplicate ngrams are kept by linear probing and checked until an empty slot.
            for (size_t cell = to_ngram(pos) % HASH_SIZE; _hash[cell].off;
                 cell = (cell + 1) % HASH_SIZE) {
                const size_t ngram_offset = _hash[cell].off - 1;
                if (pos_offset < ngram_offset) {
                    continue;
                }

                const size_t needle_index = _hash[cell].id;
                // Convert the matched ngram position in the haystack back to the candidate
                // start position of the whole needle, then verify the full needle below.
                const size_t match_offset = pos_offset - ngram_offset;
                const uint8_t* match = haystack + match_offset;
                const auto candidate_position = static_cast<Int32>(match_offset + 1);
                if (answer[needle_index] != 0 && candidate_position >= answer[needle_index]) {
                    continue;
                }

                const auto& needle = _needles[needle_index];
                // The hash table only proves that one 2-byte ngram matched. Full memcmp is
                // required to discard false positives from duplicate ngrams and collisions.
                if (haystack_size - match_offset >= needle.size &&
                    std::memcmp(match, needle.data, needle.size) == 0) {
                    answer[needle_index] = candidate_position;
                }
            }
        }
    }

private:
    static constexpr size_t HASH_SIZE = 64 * 1024;
    static constexpr size_t SMALL_LIMIT = HASH_SIZE / 8;

    static bool is_fallback_needle(size_t needle_size) {
        // Very short needles do not have enough 2-byte ngrams to be selective. Very long
        // needles cannot store their ngram offset in the uint8_t Offset field.
        return needle_size < 2 * sizeof(Ngram) || needle_size >= std::numeric_limits<Offset>::max();
    }

    static Ngram to_ngram(const uint8_t* pos) {
        // Read two bytes as a uint16_t. On little-endian machines, for example, "ab"
        // becomes 97 + 98 * 256. This numeric value is used as the initial hash slot.
        Ngram ngram;
        std::memcpy(&ngram, pos, sizeof(ngram));
        return ngram;
    }

    void put_ngram(Ngram ngram, int offset, size_t needle_index) {
        size_t cell = ngram % HASH_SIZE;
        // Linear probing keeps all duplicate ngrams instead of overwriting them. Search
        // uses the same probe sequence and validates every candidate with full memcmp.
        while (_hash[cell].off) {
            cell = (cell + 1) % HASH_SIZE;
        }
        _hash[cell] = {.id = static_cast<Id>(needle_index), .off = static_cast<Offset>(offset)};
    }

    const std::vector<StringRef>& _needles;
    std::vector<size_t> _fallback_needles;
    std::vector<ASCIICaseSensitiveStringSearcher> _searchers;
    std::unique_ptr<OffsetId[]> _hash;
    size_t _step = 0;
    size_t _last = 0;
};

} // namespace doris

Line	Count	Source
1		// Licensed to the Apache Software Foundation (ASF) under one
2		// or more contributor license agreements. See the NOTICE file
3		// distributed with this work for additional information
4		// regarding copyright ownership. The ASF licenses this file
5		// to you under the Apache License, Version 2.0 (the
6		// "License"); you may not use this file except in compliance
7		// with the License. You may obtain a copy of the License at
8		//
9		// http://www.apache.org/licenses/LICENSE-2.0
10		//
11		// Unless required by applicable law or agreed to in writing,
12		// software distributed under the License is distributed on an
13		// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14		// KIND, either express or implied. See the License for the
15		// specific language governing permissions and limitations
16		// under the License.
17
18		#pragma once
19
20		#include <algorithm>
21		#include <cstddef>
22		#include <cstdint>
23		#include <cstring>
24		#include <limits>
25		#include <memory>
26		#include <vector>
27
28		#include "core/string_ref.h"
29		#include "core/types.h"
30		#include "exec/common/string_searcher.h"
31
32		namespace doris {
33
34		// Searches many literal needles in one haystack. The table stores 2-byte ngrams from a
35		// batch of needles, so scanning the haystack can find candidates for many needles at once.
36		class MultiStringSearcher {
37		using Offset = uint8_t;
38		using Id = uint8_t;
39		using Ngram = uint16_t;
40
41		// One table entry is one candidate ngram occurrence in a needle. It stores the global
42		// needle id and the 1-based offset of this ngram inside that needle. off == 0 means
43		// the slot is empty, so needle offsets are stored as 1-based values.
44		struct OffsetId {
45		Id id = 0;
46		Offset off = 0;
47		};
48
49		public:
50		explicit MultiStringSearcher(const std::vector<StringRef>& needles)
51	3	: _needles(needles), _hash(new OffsetId[HASH_SIZE]) {
52	3	_searchers.reserve(_needles.size());
53	15	for (const auto& needle : _needles) {
54	15	_searchers.emplace_back(needle.data, needle.size);
55	15	}
56	3	}
57
58	6	bool has_more_to_search() {
59	6	if (_last == _needles.size()) {
60	3	return false;
61	3	}
62
63	3	std::memset(_hash.get(), 0, HASH_SIZE * sizeof(OffsetId));
64	3	_fallback_needles.clear();
65	3	_step = std::numeric_limits<size_t>::max();
66
67		// Not all needles are necessarily inserted in one table. Keeping the load factor
68		// low makes the linear-probing chains short; if the current batch reaches
69		// SMALL_LIMIT, the caller will scan all rows with this batch and call this method
70		// again for the next batch. _last is the global needle index of the next batch.
71	3	size_t ngrams = 0;
72	18	for (; _last < _needles.size(); ++_last) {
73	15	const auto& needle = _needles[_last];
74	15	if (is_fallback_needle(needle.size)) {
75	3	_fallback_needles.push_back(_last);
76	12	} else {
77	12	const size_t needle_ngrams = needle.size - sizeof(Ngram) + 1;
78	12	if (ngrams + needle_ngrams > SMALL_LIMIT && ngrams != 0) {
79	0	break;
80	0	}
81
82	12	ngrams += needle_ngrams;
83	12	_step = std::min(_step, needle_ngrams);
84		// Insert ngrams from the end to the beginning, matching the way the scan
85		// position is later converted back to the candidate needle start.
86	60	for (auto i = static_cast<int>(needle.size - sizeof(Ngram)); i >= 0; --i) {
87	48	put_ngram(to_ngram(reinterpret_cast<const uint8_t*>(needle.data + i)), i + 1,
88	48	_last);
89	48	}
90	12	}
91	15	}
92
93	3	return true;
94	6	}
95
96	8	void search_one_all(const uint8_t* haystack, const uint8_t* haystack_end, Int32* answer) const {
97		// answer points to the beginning of one result array row. Entries are indexed by
98		// global needle id, so different batches write into the same full-size row result.
99	8	for (const size_t needle_index : _fallback_needles) {
100	8	const auto* match = _searchers[needle_index].search(haystack, haystack_end);
101	8	if (match != haystack_end) {
102	5	answer[needle_index] = static_cast<Int32>(match - haystack + 1);
103	5	}
104	8	}
105
106	8	const auto haystack_size = static_cast<size_t>(haystack_end - haystack);
107	8	if (_step == std::numeric_limits<size_t>::max() \|\| haystack_size < sizeof(Ngram)) {
108		// The batch may contain only fallback needles, or the haystack may be too short
109		// to read a 2-byte ngram. Fallback needles have already been handled above.
110	2	return;
111	2	}
112
113	19	for (size_t pos_offset = _step - sizeof(Ngram); pos_offset + sizeof(Ngram) <= haystack_size;
114	13	pos_offset += _step) {
115	13	const uint8_t* pos = haystack + pos_offset;
116		// A 2-byte ngram has exactly 65536 possible values. HASH_SIZE is also 65536,
117		// so ngram % HASH_SIZE is effectively direct addressing. Collisions and
118		// duplicate ngrams are kept by linear probing and checked until an empty slot.
119	29	for (size_t cell = to_ngram(pos) % HASH_SIZE; _hash[cell].off;
120	16	cell = (cell + 1) % HASH_SIZE) {
121	16	const size_t ngram_offset = _hash[cell].off - 1;
122	16	if (pos_offset < ngram_offset) {
123	0	continue;
124	0	}
125
126	16	const size_t needle_index = _hash[cell].id;
127		// Convert the matched ngram position in the haystack back to the candidate
128		// start position of the whole needle, then verify the full needle below.
129	16	const size_t match_offset = pos_offset - ngram_offset;
130	16	const uint8_t* match = haystack + match_offset;
131	16	const auto candidate_position = static_cast<Int32>(match_offset + 1);
132	16	if (answer[needle_index] != 0 && candidate_position >= answer[needle_index]) {
133	0	continue;
134	0	}
135
136	16	const auto& needle = _needles[needle_index];
137		// The hash table only proves that one 2-byte ngram matched. Full memcmp is
138		// required to discard false positives from duplicate ngrams and collisions.
139	16	if (haystack_size - match_offset >= needle.size &&
140	16	std::memcmp(match, needle.data, needle.size) == 0) {
141	8	answer[needle_index] = candidate_position;
142	8	}
143	16	}
144	13	}
145	6	}
146
147		private:
148		static constexpr size_t HASH_SIZE = 64 * 1024;
149		static constexpr size_t SMALL_LIMIT = HASH_SIZE / 8;
150
151	15	static bool is_fallback_needle(size_t needle_size) {
152		// Very short needles do not have enough 2-byte ngrams to be selective. Very long
153		// needles cannot store their ngram offset in the uint8_t Offset field.
154	15	return needle_size < 2 * sizeof(Ngram) \|\| needle_size >= std::numeric_limits<Offset>::max();
155	15	}
156
157	61	static Ngram to_ngram(const uint8_t* pos) {
158		// Read two bytes as a uint16_t. On little-endian machines, for example, "ab"
159		// becomes 97 + 98 * 256. This numeric value is used as the initial hash slot.
160	61	Ngram ngram;
161	61	std::memcpy(&ngram, pos, sizeof(ngram));
162	61	return ngram;
163	61	}
164
165	48	void put_ngram(Ngram ngram, int offset, size_t needle_index) {
166	48	size_t cell = ngram % HASH_SIZE;
167		// Linear probing keeps all duplicate ngrams instead of overwriting them. Search
168		// uses the same probe sequence and validates every candidate with full memcmp.
169	66	while (_hash[cell].off) {
170	18	cell = (cell + 1) % HASH_SIZE;
171	18	}
172	48	_hash[cell] = {.id = static_cast<Id>(needle_index), .off = static_cast<Offset>(offset)};
173	48	}
174
175		const std::vector<StringRef>& _needles;
176		std::vector<size_t> _fallback_needles;
177		std::vector<ASCIICaseSensitiveStringSearcher> _searchers;
178		std::unique_ptr<OffsetId[]> _hash;
179		size_t _step = 0;
180		size_t _last = 0;
181		};
182
183		} // namespace doris

Coverage Report

Created: 2026-06-02 12:10