/root/doris/be/src/vec/functions/regexps.h
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | // This file is copied from |
18 | | // https://github.com/ClickHouse/ClickHouse/blob/master/src/Functions/Regexps.h |
19 | | // and modified by Doris |
20 | | |
21 | | #pragma once |
22 | | |
23 | | #include <hs/hs.h> |
24 | | #include <hs/hs_common.h> |
25 | | |
26 | | #include <boost/container_hash/hash.hpp> |
27 | | #include <memory> |
28 | | #include <mutex> |
29 | | #include <optional> |
30 | | #include <string> |
31 | | #include <utility> |
32 | | #include <vector> |
33 | | |
34 | | #include "common/exception.h" |
35 | | #include "vec/common/string_ref.h" |
36 | | |
37 | | namespace doris::vectorized::multiregexps { |
38 | | |
39 | | template <typename Deleter, Deleter deleter> |
40 | | struct HyperscanDeleter { |
41 | | template <typename T> |
42 | 0 | void operator()(T* ptr) const { |
43 | 0 | deleter(ptr); |
44 | 0 | } Unexecuted instantiation: _ZNK5doris10vectorized12multiregexps16HyperscanDeleterIPFiP16hs_compile_errorEXadL_Z21hs_free_compile_errorEEEclIS3_EEvPT_ Unexecuted instantiation: _ZNK5doris10vectorized12multiregexps16HyperscanDeleterIPFiP11hs_databaseEXadL_Z16hs_free_databaseEEEclIS3_EEvPT_ Unexecuted instantiation: _ZNK5doris10vectorized12multiregexps16HyperscanDeleterIPFiP10hs_scratchEXadL_Z15hs_free_scratchEEEclIS3_EEvPT_ |
45 | | }; |
46 | | |
47 | | /// Helper unique pointers to correctly delete the allocated space when hyperscan cannot compile something and we throw an exception. |
48 | | using CompilerError = |
49 | | std::unique_ptr<hs_compile_error_t, |
50 | | HyperscanDeleter<decltype(&hs_free_compile_error), &hs_free_compile_error>>; |
51 | | using ScratchPtr = std::unique_ptr<hs_scratch_t, |
52 | | HyperscanDeleter<decltype(&hs_free_scratch), &hs_free_scratch>>; |
53 | | using DataBasePtr = |
54 | | std::unique_ptr<hs_database_t, |
55 | | HyperscanDeleter<decltype(&hs_free_database), &hs_free_database>>; |
56 | | |
57 | | /// Database is thread safe across multiple threads and Scratch is not but we can copy it whenever we use it in the searcher. |
58 | | class Regexps { |
59 | | public: |
60 | 0 | Regexps(hs_database_t* db_, hs_scratch_t* scratch_) : db {db_}, scratch {scratch_} {} |
61 | | |
62 | 0 | hs_database_t* getDB() const { return db.get(); } |
63 | 0 | hs_scratch_t* getScratch() const { return scratch.get(); } |
64 | | |
65 | | private: |
66 | | DataBasePtr db; |
67 | | ScratchPtr scratch; |
68 | | }; |
69 | | |
70 | | class DeferredConstructedRegexps { |
71 | | public: |
72 | | explicit DeferredConstructedRegexps(std::function<Regexps()> constructor_) |
73 | 0 | : constructor(std::move(constructor_)) {} |
74 | | |
75 | 0 | Regexps* get() { |
76 | 0 | std::lock_guard lock(mutex); |
77 | 0 | if (regexps) { |
78 | 0 | return &*regexps; |
79 | 0 | } |
80 | 0 | regexps = constructor(); |
81 | 0 | return &*regexps; |
82 | 0 | } |
83 | | |
84 | | private: |
85 | | std::mutex mutex; |
86 | | std::function<Regexps()> constructor; |
87 | | std::optional<Regexps> regexps; |
88 | | }; |
89 | | |
90 | | using DeferredConstructedRegexpsPtr = std::shared_ptr<DeferredConstructedRegexps>; |
91 | | |
92 | | template <bool save_indices, bool WithEditDistance> |
93 | | Regexps constructRegexps(const std::vector<String>& str_patterns, |
94 | 0 | [[maybe_unused]] std::optional<UInt32> edit_distance) { |
95 | | /// Common pointers |
96 | 0 | std::vector<const char*> patterns; |
97 | 0 | std::vector<unsigned int> flags; |
98 | | |
99 | | /// Pointer for external edit distance compilation |
100 | 0 | std::vector<hs_expr_ext> ext_exprs; |
101 | 0 | std::vector<const hs_expr_ext*> ext_exprs_ptrs; |
102 | |
|
103 | 0 | patterns.reserve(str_patterns.size()); |
104 | 0 | flags.reserve(str_patterns.size()); |
105 | |
|
106 | 0 | if constexpr (WithEditDistance) { |
107 | 0 | ext_exprs.reserve(str_patterns.size()); |
108 | 0 | ext_exprs_ptrs.reserve(str_patterns.size()); |
109 | 0 | } |
110 | |
|
111 | 0 | for (const auto& ref : str_patterns) { |
112 | 0 | patterns.push_back(ref.data()); |
113 | | /* Flags below are the pattern matching flags. |
114 | | * HS_FLAG_DOTALL is a compile flag where matching a . will not exclude newlines. This is a good |
115 | | * performance practice according to Hyperscan API. https://intel.github.io/hyperscan/dev-reference/performance.html#dot-all-mode |
116 | | * HS_FLAG_ALLOWEMPTY is a compile flag where empty strings are allowed to match. |
117 | | * HS_FLAG_UTF8 is a flag where UTF8 literals are matched. |
118 | | * HS_FLAG_SINGLEMATCH is a compile flag where each pattern match will be returned only once. it is a good performance practice |
119 | | * as it is said in the Hyperscan documentation. https://intel.github.io/hyperscan/dev-reference/performance.html#single-match-flag |
120 | | */ |
121 | 0 | flags.push_back(HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8); |
122 | 0 | if constexpr (WithEditDistance) { |
123 | | /// Hyperscan currently does not support UTF8 matching with edit distance. |
124 | 0 | flags.back() &= ~HS_FLAG_UTF8; |
125 | 0 | ext_exprs.emplace_back(); |
126 | | /// HS_EXT_FLAG_EDIT_DISTANCE is a compile flag responsible for Levenstein distance. |
127 | 0 | ext_exprs.back().flags = HS_EXT_FLAG_EDIT_DISTANCE; |
128 | 0 | ext_exprs.back().edit_distance = edit_distance.value(); |
129 | 0 | ext_exprs_ptrs.push_back(&ext_exprs.back()); |
130 | 0 | } |
131 | 0 | } |
132 | 0 | hs_database_t* db = nullptr; |
133 | 0 | hs_compile_error_t* compile_error = nullptr; |
134 | |
|
135 | 0 | std::unique_ptr<unsigned int[]> ids; |
136 | | |
137 | | /// We mark the patterns to provide the callback results. |
138 | 0 | if constexpr (save_indices) { |
139 | 0 | ids.reset(new unsigned int[patterns.size()]); |
140 | 0 | for (size_t i = 0; i < patterns.size(); ++i) { |
141 | 0 | ids[i] = static_cast<unsigned>(i + 1); |
142 | 0 | } |
143 | 0 | } |
144 | |
|
145 | 0 | for (auto& pattern : patterns) { |
146 | 0 | LOG(INFO) << "pattern: " << pattern << "\n"; |
147 | 0 | } |
148 | |
|
149 | 0 | hs_error_t err; |
150 | 0 | if constexpr (!WithEditDistance) { |
151 | 0 | err = hs_compile_multi(patterns.data(), flags.data(), ids.get(), |
152 | 0 | static_cast<unsigned>(patterns.size()), HS_MODE_BLOCK, nullptr, &db, |
153 | 0 | &compile_error); |
154 | 0 | } else { |
155 | 0 | err = hs_compile_ext_multi(patterns.data(), flags.data(), ids.get(), ext_exprs_ptrs.data(), |
156 | 0 | static_cast<unsigned>(patterns.size()), HS_MODE_BLOCK, nullptr, |
157 | 0 | &db, &compile_error); |
158 | 0 | } |
159 | |
|
160 | 0 | if (err != HS_SUCCESS) [[unlikely]] { |
161 | | /// CompilerError is a unique_ptr, so correct memory free after the exception is thrown. |
162 | 0 | CompilerError error(compile_error); |
163 | |
|
164 | 0 | if (error->expression < 0) { // error has nothing to do with the patterns themselves |
165 | 0 | throw doris::Exception(Status::InternalError("Compile regexp expression failed. got {}", |
166 | 0 | error->message)); |
167 | 0 | } else { |
168 | 0 | throw doris::Exception(Status::InvalidArgument( |
169 | 0 | "Compile regexp expression failed. got {}. some expressions may be illegal", |
170 | 0 | error->message)); |
171 | 0 | } |
172 | 0 | } |
173 | | |
174 | | /// We allocate the scratch space only once, then copy it across multiple threads with hs_clone_scratch |
175 | | /// function which is faster than allocating scratch space each time in each thread. |
176 | 0 | hs_scratch_t* scratch = nullptr; |
177 | 0 | err = hs_alloc_scratch(db, &scratch); |
178 | |
|
179 | 0 | if (err != HS_SUCCESS) [[unlikely]] { |
180 | 0 | if (err == HS_NOMEM) [[unlikely]] { |
181 | 0 | throw doris::Exception(Status::MemoryAllocFailed( |
182 | 0 | "Allocating memory failed on compiling regexp expressions.")); |
183 | 0 | } else { |
184 | 0 | throw doris::Exception(Status::InvalidArgument( |
185 | 0 | "Compile regexp expression failed with unexpected arguments perhaps")); |
186 | 0 | } |
187 | 0 | } |
188 | | |
189 | 0 | return {db, scratch}; |
190 | 0 | } |
191 | | |
192 | | /// Maps string pattern vectors + edit distance to compiled vectorscan regexps. Uses the same eviction mechanism as the LocalCacheTable for |
193 | | /// re2 patterns. Because vectorscan regexes are overall more heavy-weight (more expensive compilation, regexes can grow up to multiple |
194 | | /// MBs, usage of scratch space), 1. GlobalCacheTable is a global singleton and, as a result, needs locking 2. the pattern compilation is |
195 | | /// done outside GlobalCacheTable's lock, at the cost of another level of locking. |
196 | | struct GlobalCacheTable { |
197 | | constexpr static size_t CACHE_SIZE = 500; /// collision probability |
198 | | |
199 | | struct Bucket { |
200 | | std::vector<String> patterns; /// key |
201 | | std::optional<UInt32> edit_distance; /// key |
202 | | /// The compiled patterns and their state (vectorscan 'database' + scratch space) are wrapped in a shared_ptr. Refcounting guarantees |
203 | | /// that eviction of a pattern does not affect parallel threads still using the pattern. |
204 | | DeferredConstructedRegexpsPtr regexps; /// value |
205 | | }; |
206 | | |
207 | | std::mutex mutex; |
208 | | std::array<Bucket, CACHE_SIZE> known_regexps; |
209 | | |
210 | | static size_t getBucketIndexFor(const std::vector<String> patterns, |
211 | 0 | std::optional<UInt32> edit_distance) { |
212 | 0 | size_t hash = 0; |
213 | 0 | for (const auto& pattern : patterns) { |
214 | 0 | boost::hash_combine(hash, pattern); |
215 | 0 | } |
216 | 0 | boost::hash_combine(hash, edit_distance); |
217 | 0 | return hash % CACHE_SIZE; |
218 | 0 | } |
219 | | }; |
220 | | |
221 | | /// If WithEditDistance is False, edit_distance must be nullopt. Also, we use templates here because each instantiation of function template |
222 | | /// has its own copy of local static variables which must not be the same for different hyperscan compilations. |
223 | | template <bool save_indices, bool WithEditDistance> |
224 | | DeferredConstructedRegexpsPtr getOrSet(const std::vector<StringRef>& patterns, |
225 | 0 | std::optional<UInt32> edit_distance) { |
226 | 0 | static GlobalCacheTable |
227 | 0 | pool; /// Different variables for different pattern parameters, thread-safe in C++11 |
228 | |
|
229 | 0 | std::vector<String> str_patterns; |
230 | 0 | str_patterns.reserve(patterns.size()); |
231 | 0 | for (const auto& pattern : patterns) { |
232 | 0 | str_patterns.emplace_back(pattern.to_string()); |
233 | 0 | } |
234 | |
|
235 | 0 | size_t bucket_idx = GlobalCacheTable::getBucketIndexFor(str_patterns, edit_distance); |
236 | | |
237 | | /// Lock cache to find compiled regexp for given pattern vector + edit distance. |
238 | 0 | std::lock_guard lock(pool.mutex); |
239 | |
|
240 | 0 | GlobalCacheTable::Bucket& bucket = pool.known_regexps[bucket_idx]; |
241 | | |
242 | | /// Pattern compilation is expensive and we don't want to block other threads reading from / inserting into the cache while we hold the |
243 | | /// cache lock during pattern compilation. Therefore, when a cache entry is created or replaced, only set the regexp constructor method |
244 | | /// and compile outside the cache lock. |
245 | | /// Note that the string patterns and the edit distance is passed into the constructor lambda by value, i.e. copied - it is not an |
246 | | /// option to reference the corresponding string patterns / edit distance key in the cache table bucket because the cache entry may |
247 | | /// already be evicted at the time the compilation starts. |
248 | |
|
249 | 0 | if (bucket.regexps == nullptr) { |
250 | | /// insert new entry |
251 | 0 | auto deferred_constructed_regexps = |
252 | 0 | std::make_shared<DeferredConstructedRegexps>([str_patterns, edit_distance]() { |
253 | 0 | return constructRegexps<save_indices, WithEditDistance>(str_patterns, |
254 | 0 | edit_distance); |
255 | 0 | }); |
256 | 0 | bucket = {std::move(str_patterns), edit_distance, deferred_constructed_regexps}; |
257 | 0 | } else if (bucket.patterns != str_patterns || bucket.edit_distance != edit_distance) { |
258 | | /// replace existing entry |
259 | 0 | auto deferred_constructed_regexps = |
260 | 0 | std::make_shared<DeferredConstructedRegexps>([str_patterns, edit_distance]() { |
261 | 0 | return constructRegexps<save_indices, WithEditDistance>(str_patterns, |
262 | 0 | edit_distance); |
263 | 0 | }); |
264 | 0 | bucket = {std::move(str_patterns), edit_distance, deferred_constructed_regexps}; |
265 | 0 | } |
266 | |
|
267 | 0 | return bucket.regexps; |
268 | 0 | } |
269 | | |
270 | | } // namespace doris::vectorized::multiregexps |