Coverage Report

Created: 2026-01-16 21:16

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/root/doris/be/src/vec/functions/like.h
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#pragma once
19
20
#include <hs/hs_common.h>
21
#include <hs/hs_runtime.h>
22
#include <re2/re2.h>
23
24
#include <algorithm>
25
#include <boost/iterator/iterator_facade.hpp>
26
#include <boost/regex.hpp>
27
#include <cstddef>
28
#include <cstdint>
29
#include <functional>
30
#include <memory>
31
#include <string>
32
33
#include "common/status.h"
34
#include "runtime/define_primitive_type.h"
35
#include "runtime/string_search.hpp"
36
#include "udf/udf.h"
37
#include "vec/aggregate_functions/aggregate_function.h"
38
#include "vec/columns/column_string.h"
39
#include "vec/columns/predicate_column.h"
40
#include "vec/common/string_ref.h"
41
#include "vec/core/column_numbers.h"
42
#include "vec/core/types.h"
43
#include "vec/data_types/data_type_number.h"
44
#include "vec/functions/function.h"
45
46
namespace doris::vectorized {
47
class Block;
48
49
// FastPath types for LIKE pattern matching optimization
50
// This allows per-row pattern analysis to avoid regex when possible
51
enum class LikeFastPath {
52
    ALLPASS,     // Pattern is just '%' or '%%...' - matches everything
53
    EQUALS,      // No wildcards - exact string match
54
    STARTS_WITH, // Pattern ends with '%' only - prefix match
55
    ENDS_WITH,   // Pattern starts with '%' only - suffix match
56
    SUBSTRING,   // Pattern is '%xxx%' - substring search
57
    REGEX        // Contains '_' or multiple '%' - needs regex
58
};
59
60
// Lightweight pattern analysis without RE2
61
// Returns the fast path type and extracts the search string (without wildcards)
62
// Correctly handles escape sequences: backslash-% -> literal %, backslash-_ -> literal _
63
inline LikeFastPath extract_like_fast_path(const char* pattern, size_t len,
64
209
                                           std::string& search_string) {
65
209
    search_string.clear();
66
209
    if (len == 0) {
67
14
        return LikeFastPath::EQUALS;
68
14
    }
69
70
    // Returns true if the character is NOT escaped (even number of preceding backslashes)
71
195
    auto is_unescaped = [&pattern](size_t pos) -> bool {
72
115
        size_t backslash_count = 0;
73
132
        while (pos > 0 && pattern[pos - 1] == '\\') {
74
17
            backslash_count++;
75
17
            pos--;
76
17
        }
77
115
        return (backslash_count % 2 == 0);
78
115
    };
79
80
195
    bool starts_with_percent = (pattern[0] == '%');
81
195
    bool ends_with_percent = (pattern[len - 1] == '%' && is_unescaped(len - 1));
82
83
    // Quick check: if starts or ends with unescaped '_', need regex
84
195
    if (pattern[0] == '_') {
85
19
        return LikeFastPath::REGEX;
86
19
    }
87
176
    if (pattern[len - 1] == '_' && is_unescaped(len - 1)) {
88
26
        return LikeFastPath::REGEX;
89
26
    }
90
91
    // Helper lambda: check if character is a wildcard that needs escaping
92
150
    auto is_wildcard = [](char c) { return c == '%' || c == '_' || c == '\\'; };
93
94
150
    size_t i = 0;
95
    // Skip leading '%' characters (unescaped)
96
209
    while (i < len && pattern[i] == '%') {
97
59
        i++;
98
59
    }
99
    // If pattern is all '%', it's ALLPASS
100
150
    if (i >= len) {
101
6
        return LikeFastPath::ALLPASS;
102
6
    }
103
104
144
    search_string.reserve(len);
105
574
    while (i < len) {
106
519
        char c = pattern[i];
107
        // Escaped character - add the literal
108
519
        if (c == '\\' && i + 1 < len && is_wildcard(pattern[i + 1])) {
109
31
            search_string.push_back(pattern[i + 1]);
110
31
            i += 2;
111
31
            continue;
112
31
        }
113
114
        // Unescaped '_' requires regex
115
488
        if (c == '_') {
116
33
            return LikeFastPath::REGEX;
117
33
        }
118
119
        // Check for trailing '%' or middle '%' (which needs regex)
120
455
        if (c == '%') {
121
            // Check if this is a trailing '%' sequence
122
56
            size_t j = i;
123
112
            while (j < len && pattern[j] == '%') {
124
56
                j++;
125
56
            }
126
56
            if (j >= len) {
127
                // All remaining chars are '%', we're done parsing
128
44
                break;
129
44
            }
130
            // '%' in the middle with more content after - need regex
131
12
            return LikeFastPath::REGEX;
132
56
        }
133
134
399
        search_string.push_back(c);
135
399
        i++;
136
399
    }
137
138
    // Determine the pattern type based on '%' positions
139
99
    if (starts_with_percent && ends_with_percent) {
140
28
        return LikeFastPath::SUBSTRING;
141
71
    } else if (starts_with_percent) {
142
10
        return LikeFastPath::ENDS_WITH;
143
61
    } else if (ends_with_percent) {
144
16
        return LikeFastPath::STARTS_WITH;
145
45
    } else {
146
45
        return LikeFastPath::EQUALS;
147
45
    }
148
99
}
149
150
8
inline std::string replace_pattern_by_escape(const StringRef& pattern, char escape_char) {
151
8
    std::string result;
152
8
    result.reserve(pattern.size);
153
59
    for (size_t i = 0; i < pattern.size; ++i) {
154
51
        if (i + 1 < pattern.size && pattern.data[i] == escape_char &&
155
51
            (pattern.data[i + 1] == escape_char || pattern.data[i + 1] == '%' ||
156
13
             pattern.data[i + 1] == '_')) {
157
            // "^^" -> "^"
158
            // "^%" -> "\%"
159
            // "^_" -> "\_"
160
10
            if ((pattern.data[i + 1] == '%' || pattern.data[i + 1] == '_')) {
161
4
                result.push_back('\\');
162
4
            }
163
10
            result.push_back(pattern.data[i + 1]);
164
10
            ++i; // skip next char
165
41
        } else if (pattern.data[i] == '\\') {
166
            // "\" -> "\\"
167
1
            result.append("\\\\");
168
40
        } else {
169
40
            result.push_back(pattern.data[i]);
170
40
        }
171
51
    }
172
8
    return result;
173
8
}
174
175
// TODO: replace with std::string_view when `LikeSearchState.substring_pattern` can
176
// construct from std::string_view.
177
struct LikeSearchState {
178
    static constexpr char escape_char = '\\';
179
180
    /// Holds the string the StringRef points to and is set any time StringRef is
181
    /// used.
182
    std::string search_string;
183
184
    std::string pattern_str;
185
186
    /// Used for LIKE predicates if the pattern is a constant argument, and is either a
187
    /// constant string or has a constant string at the beginning or end of the pattern.
188
    /// This will be set in order to check for that pattern in the corresponding part of
189
    /// the string.
190
    StringRef search_string_sv;
191
192
    /// Used for LIKE predicates if the pattern is a constant argument and has a constant
193
    /// string in the middle of it. This will be use in order to check for the substring
194
    /// in the value.
195
    doris::StringSearch substring_pattern;
196
197
    /// Used for RLIKE and REGEXP predicates if the pattern is a constant argument.
198
    std::unique_ptr<re2::RE2> regex;
199
200
    /// Used for REGEXP predicates when RE2 doesn't support the pattern (e.g., zero-width assertions like `?=`, `?!`, `?<=`, `?<!`)
201
    std::unique_ptr<boost::regex> boost_regex;
202
203
    template <typename Deleter, Deleter deleter>
204
    struct HyperscanDeleter {
205
        template <typename T>
206
264
        void operator()(T* ptr) const {
207
264
            deleter(ptr);
208
264
        }
_ZNK5doris10vectorized15LikeSearchState16HyperscanDeleterIPFiP11hs_databaseEXadL_Z16hs_free_databaseEEEclIS3_EEvPT_
Line
Count
Source
206
132
        void operator()(T* ptr) const {
207
132
            deleter(ptr);
208
132
        }
_ZNK5doris10vectorized15LikeSearchState16HyperscanDeleterIPFiP10hs_scratchEXadL_Z15hs_free_scratchEEEclIS3_EEvPT_
Line
Count
Source
206
132
        void operator()(T* ptr) const {
207
132
            deleter(ptr);
208
132
        }
209
    };
210
211
    // hyperscan compiled pattern database and scratch space, reused for performance
212
    std::unique_ptr<hs_database_t, HyperscanDeleter<decltype(&hs_free_database), &hs_free_database>>
213
            hs_database;
214
    std::unique_ptr<hs_scratch_t, HyperscanDeleter<decltype(&hs_free_scratch), &hs_free_scratch>>
215
            hs_scratch;
216
217
    // hyperscan match callback
218
    static int hs_match_handler(unsigned int /* from */,       // NOLINT
219
                                unsigned long long /* from */, // NOLINT
220
                                unsigned long long /* to */,   // NOLINT
221
71
                                unsigned int /* flags */, void* ctx) {
222
        // set result to 1 for matched row
223
71
        *((unsigned char*)ctx) = 1;
224
        /// return non-zero to indicate hyperscan stop after first matched
225
71
        return 1;
226
71
    }
227
228
597
    LikeSearchState() = default;
229
230
    Status clone(LikeSearchState& cloned);
231
232
276
    void set_search_string(const std::string& search_string_arg) {
233
276
        search_string = search_string_arg;
234
276
        search_string_sv = StringRef(search_string);
235
276
        substring_pattern.set_pattern(&search_string_sv);
236
276
    }
237
};
238
239
using LikeFn = std::function<doris::Status(const LikeSearchState*, const ColumnString&,
240
                                           const StringRef&, ColumnUInt8::Container&)>;
241
242
using ScalarLikeFn = std::function<doris::Status(const LikeSearchState*, const StringRef&,
243
                                                 const StringRef&, unsigned char*)>;
244
245
using VectorLikeFn = std::function<doris::Status(const ColumnString&, const ColumnString&,
246
                                                 ColumnUInt8::Container&)>;
247
248
struct LikeState {
249
    bool is_like_pattern;
250
    bool has_custom_escape = false;
251
    char escape_char = {};
252
    LikeSearchState search_state;
253
    LikeFn function;
254
    ScalarLikeFn scalar_function;
255
};
256
257
struct VectorPatternSearchState {
258
    MutableColumnPtr _search_strings;
259
    std::string _search_string;
260
    VectorLikeFn _vector_function;
261
    bool _pattern_matched;
262
263
    VectorPatternSearchState(VectorLikeFn vector_function)
264
1.53k
            : _search_strings(ColumnString::create()),
265
1.53k
              _vector_function(vector_function),
266
1.53k
              _pattern_matched(true) {}
267
268
1.53k
    virtual ~VectorPatternSearchState() = default;
269
270
    virtual void like_pattern_match(const std::string& pattern_str) = 0;
271
272
    virtual void regexp_pattern_match(const std::string& pattern_str) = 0;
273
};
274
275
using VPatternSearchStateSPtr = std::shared_ptr<VectorPatternSearchState>;
276
277
class FunctionLikeBase : public IFunction {
278
public:
279
0
    size_t get_number_of_arguments() const override { return 0; }
280
601
    bool is_variadic() const override { return true; }
281
282
599
    DataTypePtr get_return_type_impl(const DataTypes& /*arguments*/) const override {
283
599
        return std::make_shared<DataTypeUInt8>();
284
599
    }
285
286
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
287
                        uint32_t result, size_t /*input_rows_count*/) const override;
288
289
    friend struct VectorAllpassSearchState;
290
    friend struct VectorEqualSearchState;
291
    friend struct VectorSubStringSearchState;
292
    friend struct VectorStartsWithSearchState;
293
    friend struct VectorEndsWithSearchState;
294
295
protected:
296
    Status vector_const(const ColumnString& values, const StringRef* pattern_val,
297
                        ColumnUInt8::Container& result, const LikeFn& function,
298
                        LikeSearchState* search_state) const;
299
300
    Status vector_non_const(const ColumnString& values, const ColumnString& patterns,
301
                            ColumnUInt8::Container& result, LikeState* state,
302
                            size_t input_rows_count) const;
303
304
    Status execute_substring(const ColumnString::Chars& values,
305
                             const ColumnString::Offsets& value_offsets,
306
                             ColumnUInt8::Container& result, LikeSearchState* search_state) const;
307
308
    template <bool LIKE_PATTERN>
309
    static VPatternSearchStateSPtr pattern_type_recognition(const ColumnString& patterns);
310
311
    static Status constant_allpass_fn(const LikeSearchState* state, const ColumnString& val,
312
                                      const StringRef& pattern, ColumnUInt8::Container& result);
313
314
    static Status constant_allpass_fn_scalar(const LikeSearchState* state, const StringRef& val,
315
                                             const StringRef& pattern, unsigned char* result);
316
317
    static Status vector_allpass_fn(const ColumnString& vals, const ColumnString& search_strings,
318
                                    ColumnUInt8::Container& result);
319
320
    static Status constant_starts_with_fn(const LikeSearchState* state, const ColumnString& val,
321
                                          const StringRef& pattern, ColumnUInt8::Container& result);
322
323
    static Status constant_starts_with_fn_scalar(const LikeSearchState* state, const StringRef& val,
324
                                                 const StringRef& pattern, unsigned char* result);
325
326
    static Status vector_starts_with_fn(const ColumnString& vals,
327
                                        const ColumnString& search_strings,
328
                                        ColumnUInt8::Container& result);
329
330
    static Status constant_ends_with_fn(const LikeSearchState* state, const ColumnString& val,
331
                                        const StringRef& pattern, ColumnUInt8::Container& result);
332
333
    static Status constant_ends_with_fn_scalar(const LikeSearchState* state, const StringRef& val,
334
                                               const StringRef& pattern, unsigned char* result);
335
336
    static Status vector_ends_with_fn(const ColumnString& vals, const ColumnString& search_strings,
337
                                      ColumnUInt8::Container& result);
338
339
    static Status constant_equals_fn(const LikeSearchState* state, const ColumnString& val,
340
                                     const StringRef& pattern, ColumnUInt8::Container& result);
341
342
    static Status constant_equals_fn_scalar(const LikeSearchState* state, const StringRef& val,
343
                                            const StringRef& pattern, unsigned char* result);
344
345
    static Status vector_equals_fn(const ColumnString& vals, const ColumnString& search_strings,
346
                                   ColumnUInt8::Container& result);
347
348
    static Status constant_substring_fn(const LikeSearchState* state, const ColumnString& val,
349
                                        const StringRef& pattern, ColumnUInt8::Container& result);
350
351
    static Status constant_substring_fn_scalar(const LikeSearchState* state, const StringRef& val,
352
                                               const StringRef& pattern, unsigned char* result);
353
354
    static Status vector_substring_fn(const ColumnString& vals, const ColumnString& search_strings,
355
                                      ColumnUInt8::Container& result);
356
357
    static Status constant_regex_fn(const LikeSearchState* state, const ColumnString& val,
358
                                    const StringRef& pattern, ColumnUInt8::Container& result);
359
360
    static Status constant_regex_fn_scalar(const LikeSearchState* state, const StringRef& val,
361
                                           const StringRef& pattern, unsigned char* result);
362
363
    static Status regexp_fn(const LikeSearchState* state, const ColumnString& val,
364
                            const StringRef& pattern, ColumnUInt8::Container& result);
365
366
    static Status regexp_fn_scalar(const LikeSearchState* state, const StringRef& val,
367
                                   const StringRef& pattern, unsigned char* result);
368
369
    // hyperscan compile expression to database and allocate scratch space
370
    static Status hs_prepare(FunctionContext* context, const char* expression,
371
                             hs_database_t** database, hs_scratch_t** scratch);
372
};
373
374
class FunctionLike : public FunctionLikeBase {
375
public:
376
    static constexpr auto name = "like";
377
378
540
    static FunctionPtr create() { return std::make_shared<FunctionLike>(); }
379
380
0
    String get_name() const override { return name; }
381
382
    Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override;
383
384
    static Status construct_like_const_state(FunctionContext* ctx, const StringRef& pattern,
385
                                             std::shared_ptr<LikeState>& state,
386
                                             bool try_hyperscan = true);
387
388
    friend struct LikeSearchState;
389
    friend struct VectorAllpassSearchState;
390
    friend struct VectorEqualSearchState;
391
    friend struct VectorSubStringSearchState;
392
    friend struct VectorStartsWithSearchState;
393
    friend struct VectorEndsWithSearchState;
394
395
private:
396
    static Status like_fn(const LikeSearchState* state, const ColumnString& val,
397
                          const StringRef& pattern, ColumnUInt8::Container& result);
398
399
    static Status like_fn_scalar(const LikeSearchState* state, const StringRef& val,
400
                                 const StringRef& pattern, unsigned char* result);
401
402
    static void convert_like_pattern(const LikeSearchState* state, const std::string& pattern,
403
                                     std::string* re_pattern);
404
405
    static void remove_escape_character(std::string* search_string);
406
};
407
408
class FunctionRegexpLike : public FunctionLikeBase {
409
public:
410
    static constexpr auto name = "regexp";
411
    static constexpr auto alias = "rlike";
412
413
63
    static FunctionPtr create() { return std::make_shared<FunctionRegexpLike>(); }
414
415
0
    String get_name() const override { return name; }
416
417
    Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override;
418
};
419
420
} // namespace doris::vectorized