Coverage Report

Created: 2024-11-21 13:15

/root/doris/be/src/vec/functions/regexps.h
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
// This file is copied from
18
// https://github.com/ClickHouse/ClickHouse/blob/master/src/Functions/Regexps.h
19
// and modified by Doris
20
21
#pragma once
22
23
#include <hs/hs.h>
24
#include <hs/hs_common.h>
25
26
#include <boost/container_hash/hash.hpp>
27
#include <memory>
28
#include <mutex>
29
#include <optional>
30
#include <string>
31
#include <utility>
32
#include <vector>
33
34
#include "common/exception.h"
35
#include "vec/common/string_ref.h"
36
37
namespace doris::vectorized::multiregexps {
38
39
template <typename Deleter, Deleter deleter>
40
struct HyperscanDeleter {
41
    template <typename T>
42
0
    void operator()(T* ptr) const {
43
0
        deleter(ptr);
44
0
    }
Unexecuted instantiation: _ZNK5doris10vectorized12multiregexps16HyperscanDeleterIPFiP16hs_compile_errorEXadL_Z21hs_free_compile_errorEEEclIS3_EEvPT_
Unexecuted instantiation: _ZNK5doris10vectorized12multiregexps16HyperscanDeleterIPFiP11hs_databaseEXadL_Z16hs_free_databaseEEEclIS3_EEvPT_
Unexecuted instantiation: _ZNK5doris10vectorized12multiregexps16HyperscanDeleterIPFiP10hs_scratchEXadL_Z15hs_free_scratchEEEclIS3_EEvPT_
45
};
46
47
/// Helper unique pointers to correctly delete the allocated space when hyperscan cannot compile something and we throw an exception.
48
using CompilerError =
49
        std::unique_ptr<hs_compile_error_t,
50
                        HyperscanDeleter<decltype(&hs_free_compile_error), &hs_free_compile_error>>;
51
using ScratchPtr = std::unique_ptr<hs_scratch_t,
52
                                   HyperscanDeleter<decltype(&hs_free_scratch), &hs_free_scratch>>;
53
using DataBasePtr =
54
        std::unique_ptr<hs_database_t,
55
                        HyperscanDeleter<decltype(&hs_free_database), &hs_free_database>>;
56
57
/// Database is thread safe across multiple threads and Scratch is not but we can copy it whenever we use it in the searcher.
58
class Regexps {
59
public:
60
0
    Regexps(hs_database_t* db_, hs_scratch_t* scratch_) : db {db_}, scratch {scratch_} {}
61
62
0
    hs_database_t* getDB() const { return db.get(); }
63
0
    hs_scratch_t* getScratch() const { return scratch.get(); }
64
65
private:
66
    DataBasePtr db;
67
    ScratchPtr scratch;
68
};
69
70
class DeferredConstructedRegexps {
71
public:
72
    explicit DeferredConstructedRegexps(std::function<Regexps()> constructor_)
73
0
            : constructor(std::move(constructor_)) {}
74
75
0
    Regexps* get() {
76
0
        std::lock_guard lock(mutex);
77
0
        if (regexps) {
78
0
            return &*regexps;
79
0
        }
80
0
        regexps = constructor();
81
0
        return &*regexps;
82
0
    }
83
84
private:
85
    std::mutex mutex;
86
    std::function<Regexps()> constructor;
87
    std::optional<Regexps> regexps;
88
};
89
90
using DeferredConstructedRegexpsPtr = std::shared_ptr<DeferredConstructedRegexps>;
91
92
template <bool save_indices, bool WithEditDistance>
93
Regexps constructRegexps(const std::vector<String>& str_patterns,
94
0
                         [[maybe_unused]] std::optional<UInt32> edit_distance) {
95
    /// Common pointers
96
0
    std::vector<const char*> patterns;
97
0
    std::vector<unsigned int> flags;
98
99
    /// Pointer for external edit distance compilation
100
0
    std::vector<hs_expr_ext> ext_exprs;
101
0
    std::vector<const hs_expr_ext*> ext_exprs_ptrs;
102
103
0
    patterns.reserve(str_patterns.size());
104
0
    flags.reserve(str_patterns.size());
105
106
0
    if constexpr (WithEditDistance) {
107
0
        ext_exprs.reserve(str_patterns.size());
108
0
        ext_exprs_ptrs.reserve(str_patterns.size());
109
0
    }
110
111
0
    for (const auto& ref : str_patterns) {
112
0
        patterns.push_back(ref.data());
113
        /* Flags below are the pattern matching flags.
114
         * HS_FLAG_DOTALL is a compile flag where matching a . will not exclude newlines. This is a good
115
         * performance practice according to Hyperscan API. https://intel.github.io/hyperscan/dev-reference/performance.html#dot-all-mode
116
         * HS_FLAG_ALLOWEMPTY is a compile flag where empty strings are allowed to match.
117
         * HS_FLAG_UTF8 is a flag where UTF8 literals are matched.
118
         * HS_FLAG_SINGLEMATCH is a compile flag where each pattern match will be returned only once. it is a good performance practice
119
         * as it is said in the Hyperscan documentation. https://intel.github.io/hyperscan/dev-reference/performance.html#single-match-flag
120
         */
121
0
        flags.push_back(HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8);
122
0
        if constexpr (WithEditDistance) {
123
            /// Hyperscan currently does not support UTF8 matching with edit distance.
124
0
            flags.back() &= ~HS_FLAG_UTF8;
125
0
            ext_exprs.emplace_back();
126
            /// HS_EXT_FLAG_EDIT_DISTANCE is a compile flag responsible for Levenstein distance.
127
0
            ext_exprs.back().flags = HS_EXT_FLAG_EDIT_DISTANCE;
128
0
            ext_exprs.back().edit_distance = edit_distance.value();
129
0
            ext_exprs_ptrs.push_back(&ext_exprs.back());
130
0
        }
131
0
    }
132
0
    hs_database_t* db = nullptr;
133
0
    hs_compile_error_t* compile_error = nullptr;
134
135
0
    std::unique_ptr<unsigned int[]> ids;
136
137
    /// We mark the patterns to provide the callback results.
138
0
    if constexpr (save_indices) {
139
0
        ids.reset(new unsigned int[patterns.size()]);
140
0
        for (size_t i = 0; i < patterns.size(); ++i) {
141
0
            ids[i] = static_cast<unsigned>(i + 1);
142
0
        }
143
0
    }
144
145
0
    for (auto& pattern : patterns) {
146
0
        LOG(INFO) << "pattern: " << pattern << "\n";
147
0
    }
148
149
0
    hs_error_t err;
150
0
    if constexpr (!WithEditDistance) {
151
0
        err = hs_compile_multi(patterns.data(), flags.data(), ids.get(),
152
0
                               static_cast<unsigned>(patterns.size()), HS_MODE_BLOCK, nullptr, &db,
153
0
                               &compile_error);
154
0
    } else {
155
0
        err = hs_compile_ext_multi(patterns.data(), flags.data(), ids.get(), ext_exprs_ptrs.data(),
156
0
                                   static_cast<unsigned>(patterns.size()), HS_MODE_BLOCK, nullptr,
157
0
                                   &db, &compile_error);
158
0
    }
159
160
0
    if (err != HS_SUCCESS) [[unlikely]] {
161
        /// CompilerError is a unique_ptr, so correct memory free after the exception is thrown.
162
0
        CompilerError error(compile_error);
163
164
0
        if (error->expression < 0) { // error has nothing to do with the patterns themselves
165
0
            throw doris::Exception(Status::InternalError("Compile regexp expression failed. got {}",
166
0
                                                         error->message));
167
0
        } else {
168
0
            throw doris::Exception(Status::InvalidArgument(
169
0
                    "Compile regexp expression failed. got {}. some expressions may be illegal",
170
0
                    error->message));
171
0
        }
172
0
    }
173
174
    /// We allocate the scratch space only once, then copy it across multiple threads with hs_clone_scratch
175
    /// function which is faster than allocating scratch space each time in each thread.
176
0
    hs_scratch_t* scratch = nullptr;
177
0
    err = hs_alloc_scratch(db, &scratch);
178
179
0
    if (err != HS_SUCCESS) [[unlikely]] {
180
0
        if (err == HS_NOMEM) [[unlikely]] {
181
0
            throw doris::Exception(Status::MemoryAllocFailed(
182
0
                    "Allocating memory failed on compiling regexp expressions."));
183
0
        } else {
184
0
            throw doris::Exception(Status::InvalidArgument(
185
0
                    "Compile regexp expression failed with unexpected arguments perhaps"));
186
0
        }
187
0
    }
188
189
0
    return {db, scratch};
190
0
}
191
192
/// Maps string pattern vectors + edit distance to compiled vectorscan regexps. Uses the same eviction mechanism as the LocalCacheTable for
193
/// re2 patterns. Because vectorscan regexes are overall more heavy-weight (more expensive compilation, regexes can grow up to multiple
194
/// MBs, usage of scratch space), 1. GlobalCacheTable is a global singleton and, as a result, needs locking 2. the pattern compilation is
195
/// done outside GlobalCacheTable's lock, at the cost of another level of locking.
196
struct GlobalCacheTable {
197
    constexpr static size_t CACHE_SIZE = 500; /// collision probability
198
199
    struct Bucket {
200
        std::vector<String> patterns;        /// key
201
        std::optional<UInt32> edit_distance; /// key
202
        /// The compiled patterns and their state (vectorscan 'database' + scratch space) are wrapped in a shared_ptr. Refcounting guarantees
203
        /// that eviction of a pattern does not affect parallel threads still using the pattern.
204
        DeferredConstructedRegexpsPtr regexps; /// value
205
    };
206
207
    std::mutex mutex;
208
    std::array<Bucket, CACHE_SIZE> known_regexps;
209
210
    static size_t getBucketIndexFor(const std::vector<String> patterns,
211
0
                                    std::optional<UInt32> edit_distance) {
212
0
        size_t hash = 0;
213
0
        for (const auto& pattern : patterns) {
214
0
            boost::hash_combine(hash, pattern);
215
0
        }
216
0
        boost::hash_combine(hash, edit_distance);
217
0
        return hash % CACHE_SIZE;
218
0
    }
219
};
220
221
/// If WithEditDistance is False, edit_distance must be nullopt. Also, we use templates here because each instantiation of function template
222
/// has its own copy of local static variables which must not be the same for different hyperscan compilations.
223
template <bool save_indices, bool WithEditDistance>
224
DeferredConstructedRegexpsPtr getOrSet(const std::vector<StringRef>& patterns,
225
0
                                       std::optional<UInt32> edit_distance) {
226
0
    static GlobalCacheTable
227
0
            pool; /// Different variables for different pattern parameters, thread-safe in C++11
228
229
0
    std::vector<String> str_patterns;
230
0
    str_patterns.reserve(patterns.size());
231
0
    for (const auto& pattern : patterns) {
232
0
        str_patterns.emplace_back(pattern.to_string());
233
0
    }
234
235
0
    size_t bucket_idx = GlobalCacheTable::getBucketIndexFor(str_patterns, edit_distance);
236
237
    /// Lock cache to find compiled regexp for given pattern vector + edit distance.
238
0
    std::lock_guard lock(pool.mutex);
239
240
0
    GlobalCacheTable::Bucket& bucket = pool.known_regexps[bucket_idx];
241
242
    /// Pattern compilation is expensive and we don't want to block other threads reading from / inserting into the cache while we hold the
243
    /// cache lock during pattern compilation. Therefore, when a cache entry is created or replaced, only set the regexp constructor method
244
    /// and compile outside the cache lock.
245
    /// Note that the string patterns and the edit distance is passed into the constructor lambda by value, i.e. copied - it is not an
246
    /// option to reference the corresponding string patterns / edit distance key in the cache table bucket because the cache entry may
247
    /// already be evicted at the time the compilation starts.
248
249
0
    if (bucket.regexps == nullptr) {
250
        /// insert new entry
251
0
        auto deferred_constructed_regexps =
252
0
                std::make_shared<DeferredConstructedRegexps>([str_patterns, edit_distance]() {
253
0
                    return constructRegexps<save_indices, WithEditDistance>(str_patterns,
254
0
                                                                            edit_distance);
255
0
                });
256
0
        bucket = {std::move(str_patterns), edit_distance, deferred_constructed_regexps};
257
0
    } else if (bucket.patterns != str_patterns || bucket.edit_distance != edit_distance) {
258
        /// replace existing entry
259
0
        auto deferred_constructed_regexps =
260
0
                std::make_shared<DeferredConstructedRegexps>([str_patterns, edit_distance]() {
261
0
                    return constructRegexps<save_indices, WithEditDistance>(str_patterns,
262
0
                                                                            edit_distance);
263
0
                });
264
0
        bucket = {std::move(str_patterns), edit_distance, deferred_constructed_regexps};
265
0
    }
266
267
0
    return bucket.regexps;
268
0
}
269
270
} // namespace doris::vectorized::multiregexps