Coverage Report

Created: 2026-03-13 10:59

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exprs/function/like.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "exprs/function/like.h"
19
20
#include <fmt/format.h>
21
#include <hs/hs_compile.h>
22
#include <re2/stringpiece.h>
23
24
#include <cstddef>
25
#include <ostream>
26
#include <utility>
27
#include <vector>
28
29
#include "common/logging.h"
30
#include "core/block/block.h"
31
#include "core/block/column_with_type_and_name.h"
32
#include "core/column/column.h"
33
#include "core/column/column_const.h"
34
#include "core/column/column_vector.h"
35
#include "core/string_ref.h"
36
#include "exprs/function/simple_function_factory.h"
37
38
namespace doris {
39
#include "common/compile_check_begin.h"
40
// A regex to match any regex pattern which is equivalent to a substring search.
41
static const RE2 SUBSTRING_RE(R"((?:\.\*)*([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)(?:\.\*)*)");
42
43
// A regex to match any regex pattern which is equivalent to matching a constant string
44
// at the end of the string values.
45
static const RE2 ENDS_WITH_RE(R"((?:\.\*)*([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)\$)");
46
47
// A regex to match any regex pattern which is equivalent to matching a constant string
48
// at the start of the string values.
49
static const RE2 STARTS_WITH_RE(R"(\^([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)(?:\.\*)*)");
50
51
// A regex to match any regex pattern which is equivalent to a constant string match.
52
static const RE2 EQUALS_RE(R"(\^([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)\$)");
53
// A regex to match .*
54
static const RE2 ALLPASS_RE(R"((\.\*)+)");
55
56
// Like patterns
57
static const re2::RE2 LIKE_SUBSTRING_RE(R"((?:%+)(((\\_)|([^%_\\]))+)(?:%+))");
58
static const re2::RE2 LIKE_ENDS_WITH_RE("(?:%+)(((\\\\_)|([^%_]))+)");
59
static const re2::RE2 LIKE_STARTS_WITH_RE(R"((((\\%)|(\\_)|([^%_\\]))+)(?:%+))");
60
static const re2::RE2 LIKE_EQUALS_RE("(((\\\\_)|([^%_]))+)");
61
static const re2::RE2 LIKE_ALLPASS_RE("%+");
62
63
struct VectorAllpassSearchState : public VectorPatternSearchState {
64
452
    VectorAllpassSearchState() : VectorPatternSearchState(FunctionLikeBase::vector_allpass_fn) {}
65
66
    ~VectorAllpassSearchState() override = default;
67
68
425
    void like_pattern_match(const std::string& pattern_str) override {
69
425
        if (!pattern_str.empty() && RE2::FullMatch(pattern_str, LIKE_ALLPASS_RE)) {
70
26
            _search_strings->insert_default();
71
399
        } else {
72
399
            _pattern_matched = false;
73
399
        }
74
425
    }
75
76
32
    void regexp_pattern_match(const std::string& pattern_str) override {
77
32
        if (RE2::FullMatch(pattern_str, ALLPASS_RE)) {
78
0
            _search_strings->insert_default();
79
32
        } else {
80
32
            _pattern_matched = false;
81
32
        }
82
32
    }
83
};
84
85
struct VectorEqualSearchState : public VectorPatternSearchState {
86
452
    VectorEqualSearchState() : VectorPatternSearchState(FunctionLikeBase::vector_equals_fn) {}
87
88
    ~VectorEqualSearchState() override = default;
89
90
451
    void like_pattern_match(const std::string& pattern_str) override {
91
451
        _search_string.clear();
92
451
        if (pattern_str.empty() || RE2::FullMatch(pattern_str, LIKE_EQUALS_RE, &_search_string)) {
93
119
            FunctionLike::remove_escape_character(&_search_string);
94
119
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
95
332
        } else {
96
332
            _pattern_matched = false;
97
332
        }
98
451
    }
99
100
32
    void regexp_pattern_match(const std::string& pattern_str) override {
101
32
        _search_string.clear();
102
32
        if (RE2::FullMatch(pattern_str, EQUALS_RE, &_search_string)) {
103
0
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
104
32
        } else {
105
32
            _pattern_matched = false;
106
32
        }
107
32
    }
108
};
109
110
struct VectorSubStringSearchState : public VectorPatternSearchState {
111
    VectorSubStringSearchState()
112
452
            : VectorPatternSearchState(FunctionLikeBase::vector_substring_fn) {}
113
114
    ~VectorSubStringSearchState() override = default;
115
116
442
    void like_pattern_match(const std::string& pattern_str) override {
117
442
        _search_string.clear();
118
442
        if (RE2::FullMatch(pattern_str, LIKE_SUBSTRING_RE, &_search_string)) {
119
53
            FunctionLike::remove_escape_character(&_search_string);
120
53
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
121
389
        } else {
122
389
            _pattern_matched = false;
123
389
        }
124
442
    }
125
126
55
    void regexp_pattern_match(const std::string& pattern_str) override {
127
55
        _search_string.clear();
128
55
        if (RE2::FullMatch(pattern_str, SUBSTRING_RE, &_search_string)) {
129
34
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
130
34
        } else {
131
21
            _pattern_matched = false;
132
21
        }
133
55
    }
134
};
135
136
struct VectorStartsWithSearchState : public VectorPatternSearchState {
137
    VectorStartsWithSearchState()
138
452
            : VectorPatternSearchState(FunctionLikeBase::vector_starts_with_fn) {}
139
140
    ~VectorStartsWithSearchState() override = default;
141
142
432
    void like_pattern_match(const std::string& pattern_str) override {
143
432
        _search_string.clear();
144
432
        if (RE2::FullMatch(pattern_str, LIKE_STARTS_WITH_RE, &_search_string)) {
145
83
            FunctionLike::remove_escape_character(&_search_string);
146
83
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
147
349
        } else {
148
349
            _pattern_matched = false;
149
349
        }
150
432
    }
151
152
50
    void regexp_pattern_match(const std::string& pattern_str) override {
153
50
        _search_string.clear();
154
50
        if (RE2::FullMatch(pattern_str, STARTS_WITH_RE, &_search_string)) {
155
23
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
156
27
        } else {
157
27
            _pattern_matched = false;
158
27
        }
159
50
    }
160
};
161
162
struct VectorEndsWithSearchState : public VectorPatternSearchState {
163
452
    VectorEndsWithSearchState() : VectorPatternSearchState(FunctionLikeBase::vector_ends_with_fn) {}
164
165
    ~VectorEndsWithSearchState() override = default;
166
167
431
    void like_pattern_match(const std::string& pattern_str) override {
168
431
        _search_string.clear();
169
431
        if (RE2::FullMatch(pattern_str, LIKE_ENDS_WITH_RE, &_search_string)) {
170
42
            FunctionLike::remove_escape_character(&_search_string);
171
42
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
172
389
        } else {
173
389
            _pattern_matched = false;
174
389
        }
175
431
    }
176
177
38
    void regexp_pattern_match(const std::string& pattern_str) override {
178
38
        _search_string.clear();
179
38
        if (RE2::FullMatch(pattern_str, ENDS_WITH_RE, &_search_string)) {
180
9
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
181
29
        } else {
182
29
            _pattern_matched = false;
183
29
        }
184
38
    }
185
};
186
187
589
Status LikeSearchState::clone(LikeSearchState& cloned) {
188
589
    cloned.set_search_string(search_string);
189
190
589
    std::string re_pattern;
191
589
    FunctionLike::convert_like_pattern(this, pattern_str, &re_pattern);
192
589
    if (hs_database) { // use hyperscan
193
178
        hs_database_t* database = nullptr;
194
178
        hs_scratch_t* scratch = nullptr;
195
178
        RETURN_IF_ERROR(FunctionLike::hs_prepare(nullptr, re_pattern.c_str(), &database, &scratch));
196
197
178
        cloned.hs_database.reset(database);
198
178
        cloned.hs_scratch.reset(scratch);
199
411
    } else { // fallback to re2
200
411
        cloned.hs_database.reset();
201
411
        cloned.hs_scratch.reset();
202
203
411
        RE2::Options opts;
204
411
        opts.set_never_nl(false);
205
411
        opts.set_dot_nl(true);
206
411
        cloned.regex = std::make_unique<RE2>(re_pattern, opts);
207
411
        if (!cloned.regex->ok()) {
208
0
            return Status::InternalError("Invalid regex expression: {}", re_pattern);
209
0
        }
210
411
    }
211
212
589
    return Status::OK();
213
589
}
214
215
Status FunctionLikeBase::constant_allpass_fn(const LikeSearchState* state, const ColumnString& vals,
216
                                             const StringRef& pattern,
217
49
                                             ColumnUInt8::Container& result) {
218
49
    memset(result.data(), 1, vals.size());
219
49
    return Status::OK();
220
49
}
221
222
Status FunctionLikeBase::constant_allpass_fn_scalar(const LikeSearchState* state,
223
                                                    const StringRef& val, const StringRef& pattern,
224
81
                                                    unsigned char* result) {
225
81
    *result = 1;
226
81
    return Status::OK();
227
81
}
228
229
Status FunctionLikeBase::vector_allpass_fn(const ColumnString& vals,
230
                                           const ColumnString& search_strings,
231
21
                                           ColumnUInt8::Container& result) {
232
21
    DCHECK(vals.size() == search_strings.size());
233
21
    DCHECK(vals.size() == result.size());
234
21
    memset(result.data(), 1, vals.size());
235
21
    return Status::OK();
236
21
}
237
238
Status FunctionLikeBase::constant_starts_with_fn(const LikeSearchState* state,
239
                                                 const ColumnString& val, const StringRef& pattern,
240
990
                                                 ColumnUInt8::Container& result) {
241
990
    auto sz = val.size();
242
107k
    for (size_t i = 0; i < sz; i++) {
243
106k
        const auto& str_ref = val.get_data_at(i);
244
106k
        result[i] = (str_ref.size >= state->search_string_sv.size) &&
245
106k
                    str_ref.start_with(state->search_string_sv);
246
106k
    }
247
990
    return Status::OK();
248
990
}
249
250
Status FunctionLikeBase::constant_starts_with_fn_scalar(const LikeSearchState* state,
251
                                                        const StringRef& val,
252
                                                        const StringRef& pattern,
253
39.3k
                                                        unsigned char* result) {
254
39.3k
    *result = (val.size >= state->search_string_sv.size) &&
255
39.4k
              (state->search_string_sv == val.substring(0, state->search_string_sv.size));
256
39.3k
    return Status::OK();
257
39.3k
}
258
259
Status FunctionLikeBase::vector_starts_with_fn(const ColumnString& vals,
260
                                               const ColumnString& search_strings,
261
76
                                               ColumnUInt8::Container& result) {
262
76
    DCHECK(vals.size() == search_strings.size());
263
76
    DCHECK(vals.size() == result.size());
264
76
    auto sz = vals.size();
265
174
    for (size_t i = 0; i < sz; ++i) {
266
98
        const auto& str_sv = vals.get_data_at(i);
267
98
        const auto& search_string_sv = search_strings.get_data_at(i);
268
98
        result[i] = (str_sv.size >= search_string_sv.size) && str_sv.start_with(search_string_sv);
269
98
    }
270
76
    return Status::OK();
271
76
}
272
273
Status FunctionLikeBase::constant_ends_with_fn(const LikeSearchState* state,
274
                                               const ColumnString& val, const StringRef& pattern,
275
102
                                               ColumnUInt8::Container& result) {
276
102
    auto sz = val.size();
277
20.7k
    for (size_t i = 0; i < sz; i++) {
278
20.6k
        const auto& str_ref = val.get_data_at(i);
279
20.6k
        result[i] = (str_ref.size >= state->search_string_sv.size) &&
280
20.6k
                    str_ref.end_with(state->search_string_sv);
281
20.6k
    }
282
102
    return Status::OK();
283
102
}
284
285
Status FunctionLikeBase::constant_ends_with_fn_scalar(const LikeSearchState* state,
286
                                                      const StringRef& val,
287
                                                      const StringRef& pattern,
288
1.90k
                                                      unsigned char* result) {
289
1.90k
    *result = (val.size >= state->search_string_sv.size) &&
290
1.90k
              (state->search_string_sv == val.substring(val.size - state->search_string_sv.size,
291
1.89k
                                                        state->search_string_sv.size));
292
1.90k
    return Status::OK();
293
1.90k
}
294
295
Status FunctionLikeBase::vector_ends_with_fn(const ColumnString& vals,
296
                                             const ColumnString& search_strings,
297
34
                                             ColumnUInt8::Container& result) {
298
34
    DCHECK(vals.size() == search_strings.size());
299
34
    DCHECK(vals.size() == result.size());
300
34
    auto sz = vals.size();
301
78
    for (size_t i = 0; i < sz; ++i) {
302
44
        const auto& str_sv = vals.get_data_at(i);
303
44
        const auto& search_string_sv = search_strings.get_data_at(i);
304
44
        result[i] = (str_sv.size >= search_string_sv.size) && str_sv.end_with(search_string_sv);
305
44
    }
306
34
    return Status::OK();
307
34
}
308
309
Status FunctionLikeBase::constant_equals_fn(const LikeSearchState* state, const ColumnString& val,
310
                                            const StringRef& pattern,
311
50
                                            ColumnUInt8::Container& result) {
312
50
    auto sz = val.size();
313
127
    for (size_t i = 0; i < sz; i++) {
314
77
        result[i] = (val.get_data_at(i) == state->search_string_sv);
315
77
    }
316
50
    return Status::OK();
317
50
}
318
319
Status FunctionLikeBase::constant_equals_fn_scalar(const LikeSearchState* state,
320
                                                   const StringRef& val, const StringRef& pattern,
321
150
                                                   unsigned char* result) {
322
150
    *result = (val == state->search_string_sv);
323
150
    return Status::OK();
324
150
}
325
326
Status FunctionLikeBase::vector_equals_fn(const ColumnString& vals,
327
                                          const ColumnString& search_strings,
328
88
                                          ColumnUInt8::Container& result) {
329
88
    DCHECK(vals.size() == search_strings.size());
330
88
    DCHECK(vals.size() == result.size());
331
88
    auto sz = vals.size();
332
197
    for (size_t i = 0; i < sz; ++i) {
333
109
        const auto& str_sv = vals.get_data_at(i);
334
109
        const auto& search_string_sv = search_strings.get_data_at(i);
335
109
        result[i] = str_sv == search_string_sv;
336
109
    }
337
88
    return Status::OK();
338
88
}
339
340
Status FunctionLikeBase::constant_substring_fn(const LikeSearchState* state,
341
                                               const ColumnString& val, const StringRef& pattern,
342
0
                                               ColumnUInt8::Container& result) {
343
0
    auto sz = val.size();
344
0
    for (size_t i = 0; i < sz; i++) {
345
0
        if (state->search_string_sv.size == 0) {
346
0
            result[i] = true;
347
0
            continue;
348
0
        }
349
0
        result[i] = state->substring_pattern.search(val.get_data_at(i)) != -1;
350
0
    }
351
0
    return Status::OK();
352
0
}
353
354
Status FunctionLikeBase::constant_substring_fn_scalar(const LikeSearchState* state,
355
                                                      const StringRef& val,
356
                                                      const StringRef& pattern,
357
115k
                                                      unsigned char* result) {
358
115k
    if (state->search_string_sv.size == 0) {
359
0
        *result = true;
360
0
        return Status::OK();
361
0
    }
362
115k
    *result = state->substring_pattern.search(val) != -1;
363
115k
    return Status::OK();
364
115k
}
365
366
Status FunctionLikeBase::vector_substring_fn(const ColumnString& vals,
367
                                             const ColumnString& search_strings,
368
42
                                             ColumnUInt8::Container& result) {
369
42
    DCHECK(vals.size() == search_strings.size());
370
42
    DCHECK(vals.size() == result.size());
371
42
    auto sz = vals.size();
372
108
    for (size_t i = 0; i < sz; ++i) {
373
66
        const auto& str_sv = vals.get_data_at(i);
374
66
        const auto& search_string_sv = search_strings.get_data_at(i);
375
66
        if (search_string_sv.size == 0) {
376
2
            result[i] = true;
377
2
            continue;
378
2
        }
379
64
        doris::StringSearch substring_search(&search_string_sv);
380
64
        result[i] = substring_search.search(str_sv) != -1;
381
64
    }
382
42
    return Status::OK();
383
42
}
384
385
Status FunctionLikeBase::constant_regex_fn_scalar(const LikeSearchState* state,
386
                                                  const StringRef& val, const StringRef& pattern,
387
4.05k
                                                  unsigned char* result) {
388
4.05k
    if (state->hs_database) { // use hyperscan
389
3.87k
        auto ret = hs_scan(state->hs_database.get(), val.data, (int)val.size, 0,
390
3.87k
                           state->hs_scratch.get(), doris::LikeSearchState::hs_match_handler,
391
3.87k
                           (void*)result);
392
3.87k
        if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
393
0
            return Status::RuntimeError(fmt::format("hyperscan error: {}", ret));
394
0
        }
395
3.87k
    } else if (state->boost_regex) { // use boost::regex for advanced features
396
4
        *result = boost::regex_search(val.data, val.data + val.size, *state->boost_regex);
397
177
    } else { // fallback to re2
398
177
        *result = RE2::PartialMatch(re2::StringPiece(val.data, val.size), *state->regex);
399
177
    }
400
401
4.05k
    return Status::OK();
402
4.05k
}
403
404
Status FunctionLikeBase::regexp_fn_scalar(const LikeSearchState* state, const StringRef& val,
405
104
                                          const StringRef& pattern, unsigned char* result) {
406
104
    RE2::Options opts;
407
104
    opts.set_never_nl(false);
408
104
    opts.set_dot_nl(true);
409
104
    re2::RE2 re(re2::StringPiece(pattern.data, pattern.size), opts);
410
104
    if (re.ok()) {
411
104
        *result = RE2::PartialMatch(re2::StringPiece(val.data, val.size), re);
412
104
    } else {
413
0
        return Status::RuntimeError("Invalid pattern: {}", pattern.debug_string());
414
0
    }
415
416
104
    return Status::OK();
417
104
}
418
419
Status FunctionLikeBase::constant_regex_fn(const LikeSearchState* state, const ColumnString& val,
420
                                           const StringRef& pattern,
421
642
                                           ColumnUInt8::Container& result) {
422
642
    auto sz = val.size();
423
642
    if (state->hs_database) { // use hyperscan
424
741k
        for (size_t i = 0; i < sz; i++) {
425
741k
            const auto& str_ref = val.get_data_at(i);
426
741k
            auto ret = hs_scan(state->hs_database.get(), str_ref.data, (int)str_ref.size, 0,
427
741k
                               state->hs_scratch.get(), doris::LikeSearchState::hs_match_handler,
428
741k
                               (void*)(result.data() + i));
429
741k
            if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
430
0
                return Status::RuntimeError(fmt::format("hyperscan error: {}", ret));
431
0
            }
432
741k
        }
433
639
    } else if (state->boost_regex) { // use boost::regex for advanced features
434
0
        for (size_t i = 0; i < sz; i++) {
435
0
            const auto& str_ref = val.get_data_at(i);
436
0
            *(result.data() + i) = boost::regex_search(str_ref.data, str_ref.data + str_ref.size,
437
0
                                                       *state->boost_regex);
438
0
        }
439
3
    } else { // fallback to re2
440
7
        for (size_t i = 0; i < sz; i++) {
441
4
            const auto& str_ref = val.get_data_at(i);
442
4
            *(result.data() + i) =
443
4
                    RE2::PartialMatch(re2::StringPiece(str_ref.data, str_ref.size), *state->regex);
444
4
        }
445
3
    }
446
447
642
    return Status::OK();
448
642
}
449
450
Status FunctionLikeBase::regexp_fn(const LikeSearchState* state, const ColumnString& val,
451
0
                                   const StringRef& pattern, ColumnUInt8::Container& result) {
452
0
    std::string re_pattern(pattern.data, pattern.size);
453
454
0
    hs_database_t* database = nullptr;
455
0
    hs_scratch_t* scratch = nullptr;
456
0
    if (hs_prepare(nullptr, re_pattern.c_str(), &database, &scratch).ok()) { // use hyperscan
457
0
        auto sz = val.size();
458
0
        for (size_t i = 0; i < sz; i++) {
459
0
            const auto& str_ref = val.get_data_at(i);
460
0
            auto ret =
461
0
                    hs_scan(database, str_ref.data, (int)str_ref.size, 0, scratch,
462
0
                            doris::LikeSearchState::hs_match_handler, (void*)(result.data() + i));
463
0
            if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
464
0
                return Status::RuntimeError(fmt::format("hyperscan error: {}", ret));
465
0
            }
466
0
        }
467
468
0
        hs_free_scratch(scratch);
469
0
        hs_free_database(database);
470
0
    } else { // fallback to re2
471
0
        RE2::Options opts;
472
0
        opts.set_never_nl(false);
473
0
        opts.set_dot_nl(true);
474
0
        re2::RE2 re(re_pattern, opts);
475
0
        if (re.ok()) {
476
0
            auto sz = val.size();
477
0
            for (size_t i = 0; i < sz; i++) {
478
0
                const auto& str_ref = val.get_data_at(i);
479
0
                *(result.data() + i) =
480
0
                        RE2::PartialMatch(re2::StringPiece(str_ref.data, str_ref.size), re);
481
0
            }
482
0
        } else {
483
0
            return Status::RuntimeError("Invalid pattern: {}", pattern.debug_string());
484
0
        }
485
0
    }
486
487
0
    return Status::OK();
488
0
}
489
490
// Compile the expression into a hyperscan database and allocate its scratch space.
491
Status FunctionLikeBase::hs_prepare(FunctionContext* context, const char* expression,
492
1.51k
                                    hs_database_t** database, hs_scratch_t** scratch) {
493
1.51k
    hs_compile_error_t* compile_err;
494
1.51k
    auto res = hs_compile(expression, HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8,
495
1.51k
                          HS_MODE_BLOCK, nullptr, database, &compile_err);
496
497
1.51k
    if (res != HS_SUCCESS) {
498
19
        *database = nullptr;
499
19
        std::string error_message = compile_err->message;
500
19
        hs_free_compile_error(compile_err);
501
        // Do not call FunctionContext::set_error here, since we do not want to cancel the query here.
502
19
        return Status::RuntimeError<false>("hs_compile regex pattern error:" + error_message);
503
19
    }
504
1.49k
    hs_free_compile_error(compile_err);
505
506
1.49k
    if (hs_alloc_scratch(*database, scratch) != HS_SUCCESS) {
507
0
        hs_free_database(*database);
508
0
        *database = nullptr;
509
0
        *scratch = nullptr;
510
        // Do not call FunctionContext::set_error here, since we do not want to cancel the query here.
511
0
        return Status::RuntimeError<false>("hs_alloc_scratch allocate scratch space error");
512
0
    }
513
514
1.49k
    return Status::OK();
515
1.49k
}
516
517
Status FunctionLikeBase::execute_impl(FunctionContext* context, Block& block,
518
                                      const ColumnNumbers& arguments, uint32_t result,
519
3.85k
                                      size_t input_rows_count) const {
520
3.85k
    const auto values_col =
521
3.85k
            block.get_by_position(arguments[0]).column->convert_to_full_column_if_const();
522
3.85k
    const auto* values = check_and_get_column<ColumnString>(values_col.get());
523
524
3.85k
    if (!values) {
525
0
        return Status::InternalError("Not supported input arguments types");
526
0
    }
527
    // result column
528
3.85k
    auto res = ColumnUInt8::create();
529
3.85k
    ColumnUInt8::Container& vec_res = res->get_data();
530
    // set default value to 0, and match functions only need to set 1/true
531
3.85k
    vec_res.resize_fill(input_rows_count);
532
3.85k
    auto* state = reinterpret_cast<LikeState*>(
533
3.85k
            context->get_function_state(FunctionContext::THREAD_LOCAL));
534
    // For constant_substring_fn, search the column's whole chars buffer in one pass for performance.
535
3.85k
    if (constant_substring_fn ==
536
3.85k
        *(state->function
537
3.85k
                  .target<doris::Status (*)(const LikeSearchState* state, const ColumnString&,
538
3.85k
                                            const StringRef&, ColumnUInt8::Container&)>())) {
539
1.57k
        RETURN_IF_ERROR(execute_substring(values->get_chars(), values->get_offsets(), vec_res,
540
1.57k
                                          &state->search_state));
541
2.28k
    } else {
542
2.28k
        const auto pattern_col = block.get_by_position(arguments[1]).column;
543
2.28k
        if (const auto* str_patterns = check_and_get_column<ColumnString>(pattern_col.get())) {
544
452
            RETURN_IF_ERROR(
545
452
                    vector_non_const(*values, *str_patterns, vec_res, state, input_rows_count));
546
1.83k
        } else if (const auto* const_patterns =
547
1.83k
                           check_and_get_column<ColumnConst>(pattern_col.get())) {
548
1.83k
            const auto& pattern_val = const_patterns->get_data_at(0);
549
1.83k
            RETURN_IF_ERROR(vector_const(*values, &pattern_val, vec_res, state->function,
550
1.83k
                                         &state->search_state));
551
18.4E
        } else {
552
18.4E
            return Status::InternalError("Not supported input arguments types");
553
18.4E
        }
554
2.28k
    }
555
3.85k
    block.replace_by_position(result, std::move(res));
556
3.85k
    return Status::OK();
557
3.85k
}
558
559
Status FunctionLikeBase::execute_substring(const ColumnString::Chars& values,
560
                                           const ColumnString::Offsets& value_offsets,
561
                                           ColumnUInt8::Container& result,
562
1.57k
                                           LikeSearchState* search_state) const {
563
    // Treat the contiguous data of all the strings as one long string.
564
1.57k
    const UInt8* begin = values.data();
565
1.57k
    const UInt8* end = begin + values.size();
566
1.57k
    const UInt8* pos = begin;
567
568
    /// Current index in the array of strings.
569
1.57k
    size_t i = 0;
570
1.57k
    size_t needle_size = search_state->substring_pattern.get_pattern_length();
571
572
    /// We will search for the next occurrence in all strings at once.
573
240k
    while (pos < end) {
574
        // search returns the position where the next matched substring starts
575
240k
        pos = (UInt8*)search_state->substring_pattern.search((char*)pos, end - pos);
576
240k
        if (pos >= end) {
577
1.33k
            break;
578
1.33k
        }
579
580
        /// Determine which index it refers to.
581
        /// begin + value_offsets[i] is the start offset of string at i+1
582
2.49M
        while (i < value_offsets.size() && begin + value_offsets[i] < pos) {
583
2.25M
            ++i;
584
2.25M
        }
585
586
        /// We check that the entry does not pass through the boundaries of strings.
587
239k
        if (pos + needle_size <= begin + value_offsets[i]) {
588
214k
            result[i] = 1;
589
214k
        }
590
591
        // move to next string offset
592
239k
        pos = begin + value_offsets[i];
593
239k
        ++i;
594
239k
    }
595
596
1.57k
    return Status::OK();
597
1.57k
}
598
599
Status FunctionLikeBase::vector_const(const ColumnString& values, const StringRef* pattern_val,
600
                                      ColumnUInt8::Container& result, const LikeFn& function,
601
1.83k
                                      LikeSearchState* search_state) const {
602
1.83k
    RETURN_IF_ERROR((function)(search_state, values,
603
1.83k
                               *reinterpret_cast<const StringRef*>(pattern_val), result));
604
1.83k
    return Status::OK();
605
1.83k
}
606
607
template <bool LIKE_PATTERN>
608
452
VPatternSearchStateSPtr FunctionLikeBase::pattern_type_recognition(const ColumnString& patterns) {
609
452
    VPatternSearchStateSPtr allpass_state = std::make_shared<VectorAllpassSearchState>();
610
452
    VPatternSearchStateSPtr equal_state = std::make_shared<VectorEqualSearchState>();
611
452
    VPatternSearchStateSPtr substring_state = std::make_shared<VectorSubStringSearchState>();
612
452
    VPatternSearchStateSPtr starts_with_state = std::make_shared<VectorStartsWithSearchState>();
613
452
    VPatternSearchStateSPtr ends_with_state = std::make_shared<VectorEndsWithSearchState>();
614
452
    size_t size = patterns.size();
615
616
1.03k
    for (size_t i = 0; i < size; ++i) {
617
592
        if (!allpass_state->_pattern_matched && !equal_state->_pattern_matched &&
618
592
            !substring_state->_pattern_matched && !starts_with_state->_pattern_matched &&
619
592
            !ends_with_state->_pattern_matched) {
620
12
            return nullptr;
621
12
        }
622
580
        std::string pattern_str = patterns.get_data_at(i).to_string();
623
580
        if (allpass_state->_pattern_matched) {
624
457
            if constexpr (LIKE_PATTERN) {
625
425
                allpass_state->like_pattern_match(pattern_str);
626
425
            } else {
627
32
                allpass_state->regexp_pattern_match(pattern_str);
628
32
            }
629
457
        }
630
580
        if (equal_state->_pattern_matched) {
631
483
            if constexpr (LIKE_PATTERN) {
632
451
                equal_state->like_pattern_match(pattern_str);
633
451
            } else {
634
32
                equal_state->regexp_pattern_match(pattern_str);
635
32
            }
636
483
        }
637
580
        if (substring_state->_pattern_matched) {
638
497
            if constexpr (LIKE_PATTERN) {
639
442
                substring_state->like_pattern_match(pattern_str);
640
442
            } else {
641
55
                substring_state->regexp_pattern_match(pattern_str);
642
55
            }
643
497
        }
644
580
        if (starts_with_state->_pattern_matched) {
645
482
            if constexpr (LIKE_PATTERN) {
646
432
                starts_with_state->like_pattern_match(pattern_str);
647
432
            } else {
648
50
                starts_with_state->regexp_pattern_match(pattern_str);
649
50
            }
650
482
        }
651
580
        if (ends_with_state->_pattern_matched) {
652
469
            if constexpr (LIKE_PATTERN) {
653
431
                ends_with_state->like_pattern_match(pattern_str);
654
431
            } else {
655
38
                ends_with_state->regexp_pattern_match(pattern_str);
656
38
            }
657
469
        }
658
580
    }
659
660
440
    if (allpass_state->_pattern_matched) {
661
21
        return allpass_state;
662
419
    } else if (equal_state->_pattern_matched) {
663
88
        return equal_state;
664
331
    } else if (substring_state->_pattern_matched) {
665
42
        return substring_state;
666
289
    } else if (starts_with_state->_pattern_matched) {
667
76
        return starts_with_state;
668
213
    } else if (ends_with_state->_pattern_matched) {
669
34
        return ends_with_state;
670
179
    } else {
671
179
        return nullptr;
672
179
    }
673
440
}
_ZN5doris16FunctionLikeBase24pattern_type_recognitionILb1EEESt10shared_ptrINS_24VectorPatternSearchStateEERKNS_9ColumnStrIjEE
Line
Count
Source
608
420
VPatternSearchStateSPtr FunctionLikeBase::pattern_type_recognition(const ColumnString& patterns) {
609
420
    VPatternSearchStateSPtr allpass_state = std::make_shared<VectorAllpassSearchState>();
610
420
    VPatternSearchStateSPtr equal_state = std::make_shared<VectorEqualSearchState>();
611
420
    VPatternSearchStateSPtr substring_state = std::make_shared<VectorSubStringSearchState>();
612
420
    VPatternSearchStateSPtr starts_with_state = std::make_shared<VectorStartsWithSearchState>();
613
420
    VPatternSearchStateSPtr ends_with_state = std::make_shared<VectorEndsWithSearchState>();
614
420
    size_t size = patterns.size();
615
616
921
    for (size_t i = 0; i < size; ++i) {
617
513
        if (!allpass_state->_pattern_matched && !equal_state->_pattern_matched &&
618
513
            !substring_state->_pattern_matched && !starts_with_state->_pattern_matched &&
619
513
            !ends_with_state->_pattern_matched) {
620
12
            return nullptr;
621
12
        }
622
501
        std::string pattern_str = patterns.get_data_at(i).to_string();
623
501
        if (allpass_state->_pattern_matched) {
624
425
            if constexpr (LIKE_PATTERN) {
625
425
                allpass_state->like_pattern_match(pattern_str);
626
            } else {
627
                allpass_state->regexp_pattern_match(pattern_str);
628
            }
629
425
        }
630
501
        if (equal_state->_pattern_matched) {
631
451
            if constexpr (LIKE_PATTERN) {
632
451
                equal_state->like_pattern_match(pattern_str);
633
            } else {
634
                equal_state->regexp_pattern_match(pattern_str);
635
            }
636
451
        }
637
501
        if (substring_state->_pattern_matched) {
638
442
            if constexpr (LIKE_PATTERN) {
639
442
                substring_state->like_pattern_match(pattern_str);
640
            } else {
641
                substring_state->regexp_pattern_match(pattern_str);
642
            }
643
442
        }
644
501
        if (starts_with_state->_pattern_matched) {
645
432
            if constexpr (LIKE_PATTERN) {
646
432
                starts_with_state->like_pattern_match(pattern_str);
647
            } else {
648
                starts_with_state->regexp_pattern_match(pattern_str);
649
            }
650
432
        }
651
501
        if (ends_with_state->_pattern_matched) {
652
431
            if constexpr (LIKE_PATTERN) {
653
431
                ends_with_state->like_pattern_match(pattern_str);
654
            } else {
655
                ends_with_state->regexp_pattern_match(pattern_str);
656
            }
657
431
        }
658
501
    }
659
660
408
    if (allpass_state->_pattern_matched) {
661
21
        return allpass_state;
662
387
    } else if (equal_state->_pattern_matched) {
663
88
        return equal_state;
664
299
    } else if (substring_state->_pattern_matched) {
665
31
        return substring_state;
666
268
    } else if (starts_with_state->_pattern_matched) {
667
71
        return starts_with_state;
668
197
    } else if (ends_with_state->_pattern_matched) {
669
31
        return ends_with_state;
670
166
    } else {
671
166
        return nullptr;
672
166
    }
673
408
}
_ZN5doris16FunctionLikeBase24pattern_type_recognitionILb0EEESt10shared_ptrINS_24VectorPatternSearchStateEERKNS_9ColumnStrIjEE
Line
Count
Source
608
32
VPatternSearchStateSPtr FunctionLikeBase::pattern_type_recognition(const ColumnString& patterns) {
609
32
    VPatternSearchStateSPtr allpass_state = std::make_shared<VectorAllpassSearchState>();
610
32
    VPatternSearchStateSPtr equal_state = std::make_shared<VectorEqualSearchState>();
611
32
    VPatternSearchStateSPtr substring_state = std::make_shared<VectorSubStringSearchState>();
612
32
    VPatternSearchStateSPtr starts_with_state = std::make_shared<VectorStartsWithSearchState>();
613
32
    VPatternSearchStateSPtr ends_with_state = std::make_shared<VectorEndsWithSearchState>();
614
32
    size_t size = patterns.size();
615
616
111
    for (size_t i = 0; i < size; ++i) {
617
79
        if (!allpass_state->_pattern_matched && !equal_state->_pattern_matched &&
618
79
            !substring_state->_pattern_matched && !starts_with_state->_pattern_matched &&
619
79
            !ends_with_state->_pattern_matched) {
620
0
            return nullptr;
621
0
        }
622
79
        std::string pattern_str = patterns.get_data_at(i).to_string();
623
79
        if (allpass_state->_pattern_matched) {
624
            if constexpr (LIKE_PATTERN) {
625
                allpass_state->like_pattern_match(pattern_str);
626
32
            } else {
627
32
                allpass_state->regexp_pattern_match(pattern_str);
628
32
            }
629
32
        }
630
79
        if (equal_state->_pattern_matched) {
631
            if constexpr (LIKE_PATTERN) {
632
                equal_state->like_pattern_match(pattern_str);
633
32
            } else {
634
32
                equal_state->regexp_pattern_match(pattern_str);
635
32
            }
636
32
        }
637
79
        if (substring_state->_pattern_matched) {
638
            if constexpr (LIKE_PATTERN) {
639
                substring_state->like_pattern_match(pattern_str);
640
55
            } else {
641
55
                substring_state->regexp_pattern_match(pattern_str);
642
55
            }
643
55
        }
644
79
        if (starts_with_state->_pattern_matched) {
645
            if constexpr (LIKE_PATTERN) {
646
                starts_with_state->like_pattern_match(pattern_str);
647
50
            } else {
648
50
                starts_with_state->regexp_pattern_match(pattern_str);
649
50
            }
650
50
        }
651
79
        if (ends_with_state->_pattern_matched) {
652
            if constexpr (LIKE_PATTERN) {
653
                ends_with_state->like_pattern_match(pattern_str);
654
38
            } else {
655
38
                ends_with_state->regexp_pattern_match(pattern_str);
656
38
            }
657
38
        }
658
79
    }
659
660
32
    if (allpass_state->_pattern_matched) {
661
0
        return allpass_state;
662
32
    } else if (equal_state->_pattern_matched) {
663
0
        return equal_state;
664
32
    } else if (substring_state->_pattern_matched) {
665
11
        return substring_state;
666
21
    } else if (starts_with_state->_pattern_matched) {
667
5
        return starts_with_state;
668
16
    } else if (ends_with_state->_pattern_matched) {
669
3
        return ends_with_state;
670
13
    } else {
671
13
        return nullptr;
672
13
    }
673
32
}
674
675
// Evaluates LIKE / REGEXP when the pattern is a column (one pattern per row).
//
// values:           input strings.
// patterns:         pattern strings, read row by row.
// result:           output container; result[i] receives the outcome for row i.
// state:            function state (pattern kind, escape settings, scalar matcher).
// input_rows_count: number of rows to process.
//
// For LIKE patterns with a custom escape character, every pattern is first
// rewritten with replace_pattern_by_escape(). The (possibly rewritten) pattern
// column is then classified by pattern_type_recognition(); if that succeeds
// the vectorized function of the recognized state is used, otherwise each row
// is evaluated through state->scalar_function.
//
// Returns the first non-OK Status produced by a matcher, otherwise OK.
Status FunctionLikeBase::vector_non_const(const ColumnString& values, const ColumnString& patterns,
                                          ColumnUInt8::Container& result, LikeState* state,
                                          size_t input_rows_count) const {
    ColumnString::MutablePtr replaced_patterns;
    // Points at the column that is actually matched against. It only switches to
    // the rewritten column when that column has really been built, so a state with
    // has_custom_escape set but is_like_pattern unset can no longer dereference a
    // null `replaced_patterns`.
    const ColumnString* effective_patterns = &patterns;

    if (state->is_like_pattern && state->has_custom_escape) {
        replaced_patterns = ColumnString::create();
        for (size_t i = 0; i < input_rows_count; ++i) {
            const std::string val =
                    replace_pattern_by_escape(patterns.get_data_at(i), state->escape_char);
            replaced_patterns->insert_data(val.c_str(), val.size());
        }
        effective_patterns = &*replaced_patterns;
    }

    VPatternSearchStateSPtr vector_search_state;
    if (state->is_like_pattern) {
        vector_search_state = pattern_type_recognition<true>(*effective_patterns);
    } else {
        vector_search_state = pattern_type_recognition<false>(*effective_patterns);
    }

    if (vector_search_state == nullptr) {
        // pattern type recognition failed, use default case
        for (size_t i = 0; i < input_rows_count; ++i) {
            const auto pattern_val = effective_patterns->get_data_at(i);
            const auto value_val = values.get_data_at(i);
            RETURN_IF_ERROR((state->scalar_function)(&state->search_state, value_val, pattern_val,
                                                     &result[i]));
        }
        return Status::OK();
    }

    const auto* search_strings =
            static_cast<const ColumnString*>(vector_search_state->_search_strings.get());
    return (vector_search_state->_vector_function)(values, *search_strings, result);
}
712
713
// Column-wise LIKE evaluation for a pattern with no precomputed fast path.
// Converts the LIKE pattern to a regular expression with
// convert_like_pattern() and delegates the matching to regexp_fn().
Status FunctionLike::like_fn(const LikeSearchState* state, const ColumnString& val,
                             const StringRef& pattern, ColumnUInt8::Container& result) {
    std::string regex_text;
    convert_like_pattern(state, pattern.to_string(), &regex_text);
    const StringRef regex_ref(regex_text.c_str(), regex_text.size());
    return regexp_fn(state, val, regex_ref, result);
}
719
720
// Evaluates a single value against a single LIKE pattern.
//
// extract_like_fast_path() classifies the pattern and fills in the literal
// text to compare with. Simple shapes are answered with plain byte
// comparisons so no regex has to be built; anything else is converted with
// convert_like_pattern() and handed to regexp_fn_scalar().
//
// *result is set to 1 on match and 0 otherwise for the fast paths; for the
// regex path it is written by regexp_fn_scalar().
Status FunctionLike::like_fn_scalar(const LikeSearchState* state, const StringRef& val,
                                    const StringRef& pattern, unsigned char* result) {
    std::string literal;
    const LikeFastPath shape = extract_like_fast_path(pattern.data, pattern.size, literal);
    const size_t literal_len = literal.size();

    switch (shape) {
    case LikeFastPath::ALLPASS:
        *result = 1;
        return Status::OK();
    case LikeFastPath::EQUALS: {
        // The empty() test keeps memcmp from being called for an empty literal.
        const bool same_length = (val.size == literal_len);
        *result = same_length &&
                  (literal.empty() || memcmp(val.data, literal.data(), literal_len) == 0);
        return Status::OK();
    }
    case LikeFastPath::STARTS_WITH:
        *result = (val.size >= literal_len &&
                   memcmp(val.data, literal.data(), literal_len) == 0);
        return Status::OK();
    case LikeFastPath::ENDS_WITH:
        if (val.size < literal_len) {
            *result = 0;
        } else {
            const char* tail = val.data + (val.size - literal_len);
            *result = (memcmp(tail, literal.data(), literal_len) == 0);
        }
        return Status::OK();
    case LikeFastPath::SUBSTRING:
        // An empty literal is contained in every value.
        *result = literal.empty() ||
                  memmem(val.data, val.size, literal.data(), literal_len) != nullptr;
        return Status::OK();
    case LikeFastPath::REGEX:
    default:
        break;
    }

    // Fall back to regex matching
    std::string re_pattern;
    convert_like_pattern(state, pattern.to_string(), &re_pattern);
    return regexp_fn_scalar(state, val, StringRef(re_pattern.c_str(), re_pattern.size()), result);
}
762
763
// Translates a SQL LIKE pattern into a regular expression.
//
// state:      not used by this function.
// pattern:    LIKE pattern; '%' matches any sequence, '_' matches one character,
//             "\%" and "\_" are literal '%' / '_', and "\\" is a literal backslash.
// re_pattern: output; cleared first, then filled with the regex text.
//
// Rules:
//   * an empty pattern becomes "^$";
//   * "^" is prepended unless the pattern starts with '%';
//   * '%' becomes ".*" and '_' becomes ".";
//   * regex metacharacters are backslash-escaped so they stay literal;
//   * "$" is appended unless the pattern ends with the '%' wildcard.
//
// The tail anchor is decided by the last *token* emitted, not by the last
// character of the output. Checking the output's last character for '*'
// also skipped the anchor for a pattern ending in a literal '*' (emitted as
// "\*"), e.g. "_a*" became "^.a\*" with no "$".
// NOTE(review): presumably the result is evaluated with search (partial
// match) semantics, which is why the anchors matter — confirm against the
// regex matcher functions.
void FunctionLike::convert_like_pattern(const LikeSearchState* state, const std::string& pattern,
                                        std::string* re_pattern) {
    re_pattern->clear();

    if (pattern.empty()) {
        re_pattern->append("^$");
        return;
    }

    // add ^ to pattern head to match line head
    if (pattern.front() != '%') {
        re_pattern->append("^");
    }

    // True only while the most recently emitted token is the '%' wildcard.
    bool ends_with_wildcard = false;

    // expect % and _, all chars should keep it literal mean.
    const size_t pattern_len = pattern.size();
    for (size_t i = 0; i < pattern_len; i++) {
        const char c = pattern[i];
        ends_with_wildcard = false;

        if (c == '\\' && i + 1 < pattern_len) {
            const char next_c = pattern[i + 1];
            if (next_c == '%' || next_c == '_') {
                // convert "\%" and "\_" to literal "%" and "_"
                re_pattern->append(1, next_c);
                i++;
                continue;
            }
            if (next_c == '\\') {
                // keep valid escape "\\"
                re_pattern->append("\\\\");
                i++;
                continue;
            }
            // Any other character after a backslash: the backslash itself is
            // treated as a literal below and the next character is handled on
            // the following iteration.
        }

        if (c == '%') {
            re_pattern->append(".*");
            ends_with_wildcard = true;
            continue;
        }
        if (c == '_') {
            re_pattern->append(".");
            continue;
        }

        // special for hyperscan: [, ], (, ), {, }, -, *, +, \, |, /, :, ^, ., $, ?
        switch (c) {
        case '[':
        case ']':
        case '(':
        case ')':
        case '{':
        case '}':
        case '-':
        case '*':
        case '+':
        case '\\':
        case '|':
        case '/':
        case ':':
        case '^':
        case '.':
        case '$':
        case '?':
            re_pattern->append(1, '\\');
            break;
        default:
            break;
        }
        re_pattern->append(1, c);
    }

    // add $ to pattern tail to match line tail
    if (!ends_with_wildcard) {
        re_pattern->append("$");
    }
}
815
816
4.21k
// Strips LIKE escape backslashes from *search_string in place.
//
// A backslash followed by '%', '_' or '\' is dropped and the following
// character is kept; every other character (including a backslash before
// anything else, or a trailing backslash) is copied unchanged.
//
// sometime 'like' may allowed converted to 'equals/start_with/end_with/sub_with'
// so we need to remove escape from pattern to construct search string and use to do 'equals/start_with/end_with/sub_with'
void FunctionLike::remove_escape_character(std::string* search_string) {
    // Move the input aside so the output can be rebuilt into *search_string.
    std::string escaped;
    escaped.swap(*search_string);

    const size_t total = escaped.length();
    size_t pos = 0;
    while (pos < total) {
        const char current = escaped[pos];
        if (current == '\\' && pos + 1 < total) {
            const char following = escaped[pos + 1];
            if (following == '%' || following == '_' || following == '\\') {
                search_string->push_back(following);
                pos += 2;
                continue;
            }
        }
        search_string->push_back(current);
        ++pos;
    }
}
834
835
0
bool re2_full_match(const std::string& str, const RE2& re, std::vector<std::string>& results) {
836
0
    if (!re.ok()) {
837
0
        return false;
838
0
    }
839
840
0
    std::vector<RE2::Arg> arguments;
841
0
    std::vector<RE2::Arg*> arguments_ptrs;
842
0
    std::size_t args_count = re.NumberOfCapturingGroups();
843
0
    arguments.resize(args_count);
844
0
    arguments_ptrs.resize(args_count);
845
0
    results.resize(args_count);
846
0
    for (std::size_t i = 0; i < args_count; ++i) {
847
0
        arguments[i] = &results[i];
848
0
        arguments_ptrs[i] = &arguments[i];
849
0
    }
850
851
0
    return RE2::FullMatchN(str, re, arguments_ptrs.data(), (int)args_count);
852
0
}
853
854
0
void verbose_log_match(const std::string& str, const std::string& pattern_name, const RE2& re) {
855
0
    std::vector<std::string> results;
856
0
    VLOG_DEBUG << "arg str: " << str << ", size: " << str.size() << ", pattern " << pattern_name
857
0
               << ": " << re.pattern() << ", size: " << re.pattern().size();
858
0
    if (re2_full_match(str, re, results)) {
859
0
        for (int i = 0; i < results.size(); ++i) {
860
0
            VLOG_DEBUG << "match " << i << ": " << results[i] << ", size: " << results[i].size();
861
0
        }
862
0
    } else {
863
0
        VLOG_DEBUG << "no match";
864
0
    }
865
0
}
866
867
// Prepares `state` for a constant LIKE pattern.
//
// context:       function context, forwarded to hs_prepare().
// pattern:       the constant LIKE pattern.
// state:         state to populate; has_custom_escape / escape_char are read,
//                search_state, function and scalar_function are written.
// try_hyperscan: when false, hyperscan is skipped and RE2 is used directly.
//
// The pattern (after custom-escape rewriting, if enabled) is tested against
// the LIKE_*_RE classifiers in this order: all-pass, equals, starts-with,
// ends-with, substring. The first hit installs the matching constant_*_fn
// pair and the literal search string. If none hits, the pattern is converted
// to a regex and compiled with hyperscan, falling back to RE2.
//
// Returns InternalError if the RE2 fallback cannot compile the regex,
// otherwise OK.
Status FunctionLike::construct_like_const_state(FunctionContext* context, const StringRef& pattern,
                                                std::shared_ptr<LikeState>& state,
                                                bool try_hyperscan) {
    using ScalarFn = decltype(state->scalar_function);

    std::string pattern_str;
    if (state->has_custom_escape) {
        pattern_str = replace_pattern_by_escape(pattern, state->escape_char);
    } else {
        pattern_str = pattern.to_string();
    }
    state->search_state.pattern_str = pattern_str;

    // Receives the captured literal from whichever classifier matches.
    std::string search_string;

    // Shared tail of the equals / starts-with / ends-with / substring fast
    // paths: un-escape the captured literal, store it, and install the matchers.
    const auto install_literal_path = [&](const char* regex_name, const RE2& regex,
                                          const LikeFn& fn, const ScalarFn& scalar_fn) {
        if (VLOG_DEBUG_IS_ON) {
            verbose_log_match(pattern_str, regex_name, regex);
            VLOG_DEBUG << "search_string : " << search_string << ", size: " << search_string.size();
        }
        remove_escape_character(&search_string);
        if (VLOG_DEBUG_IS_ON) {
            VLOG_DEBUG << "search_string escape removed: " << search_string
                       << ", size: " << search_string.size();
        }
        state->search_state.set_search_string(search_string);
        state->function = fn;
        state->scalar_function = scalar_fn;
    };

    if (!pattern_str.empty() && RE2::FullMatch(pattern_str, LIKE_ALLPASS_RE)) {
        state->search_state.set_search_string("");
        state->function = constant_allpass_fn;
        state->scalar_function = constant_allpass_fn_scalar;
        return Status::OK();
    }

    // An empty pattern is handled as an equality test against the empty string.
    if (pattern_str.empty() || RE2::FullMatch(pattern_str, LIKE_EQUALS_RE, &search_string)) {
        install_literal_path("LIKE_EQUALS_RE", LIKE_EQUALS_RE, constant_equals_fn,
                             constant_equals_fn_scalar);
        return Status::OK();
    }

    if (RE2::FullMatch(pattern_str, LIKE_STARTS_WITH_RE, &search_string)) {
        install_literal_path("LIKE_STARTS_WITH_RE", LIKE_STARTS_WITH_RE, constant_starts_with_fn,
                             constant_starts_with_fn_scalar);
        return Status::OK();
    }

    if (RE2::FullMatch(pattern_str, LIKE_ENDS_WITH_RE, &search_string)) {
        install_literal_path("LIKE_ENDS_WITH_RE", LIKE_ENDS_WITH_RE, constant_ends_with_fn,
                             constant_ends_with_fn_scalar);
        return Status::OK();
    }

    if (RE2::FullMatch(pattern_str, LIKE_SUBSTRING_RE, &search_string)) {
        install_literal_path("LIKE_SUBSTRING_RE", LIKE_SUBSTRING_RE, constant_substring_fn,
                             constant_substring_fn_scalar);
        return Status::OK();
    }

    // No fast path applies: build a real regex.
    std::string re_pattern;
    convert_like_pattern(&state->search_state, pattern_str, &re_pattern);
    if (VLOG_DEBUG_IS_ON) {
        VLOG_DEBUG << "hyperscan, pattern str: " << pattern_str
                   << ", size: " << pattern_str.size() << ", re pattern: " << re_pattern
                   << ", size: " << re_pattern.size();
    }

    hs_database_t* database = nullptr;
    hs_scratch_t* scratch = nullptr;
    const bool hyperscan_ready =
            try_hyperscan && hs_prepare(context, re_pattern.c_str(), &database, &scratch).ok();
    if (hyperscan_ready) {
        // use hyperscan
        state->search_state.hs_database.reset(database);
        state->search_state.hs_scratch.reset(scratch);
    } else {
        // fallback to re2
        // reset hs_database to nullptr to indicate not use hyperscan
        state->search_state.hs_database.reset();
        state->search_state.hs_scratch.reset();

        RE2::Options opts;
        opts.set_never_nl(false);
        opts.set_dot_nl(true);
        state->search_state.regex = std::make_unique<RE2>(re_pattern, opts);
        if (!state->search_state.regex->ok()) {
            return Status::InternalError("Invalid regex expression: {}(origin: {})", re_pattern,
                                         pattern_str);
        }
    }

    state->function = constant_regex_fn;
    state->scalar_function = constant_regex_fn_scalar;
    return Status::OK();
}
971
972
6.64k
// Creates the LikeState for a LIKE expression.
//
// Nothing is done for scopes other than THREAD_LOCAL. Otherwise a fresh state
// is created with the generic like_fn / like_fn_scalar matchers. If argument 2
// is constant it is taken as the custom escape character (must be exactly one
// byte). If argument 1 (the pattern) is constant, the state is specialized via
// construct_like_const_state(). The state is stored in the context only after
// all of that succeeded.
//
// Returns InternalError for an escape string whose size is not 1, or any
// error from construct_like_const_state(); otherwise OK.
Status FunctionLike::open(FunctionContext* context, FunctionContext::FunctionStateScope scope) {
    if (scope != FunctionContext::THREAD_LOCAL) {
        return Status::OK();
    }

    auto state = std::make_shared<LikeState>();
    state->is_like_pattern = true;
    state->function = like_fn;
    state->scalar_function = like_fn_scalar;

    const bool escape_is_constant = context->is_col_constant(2);
    if (escape_is_constant) {
        state->has_custom_escape = true;
        const auto escape_column = context->get_constant_col(2)->column_ptr;
        const auto escape = escape_column->get_data_at(0);
        if (escape.size != 1) {
            return Status::InternalError("Escape character must be a single character, got: {}",
                                         escape.to_string());
        }
        state->escape_char = escape.data[0];
    }

    const bool pattern_is_constant = context->is_col_constant(1);
    if (pattern_is_constant) {
        const auto pattern_column = context->get_constant_col(1)->column_ptr;
        const auto pattern = pattern_column->get_data_at(0);
        RETURN_IF_ERROR(construct_like_const_state(context, pattern, state));
    }

    context->set_function_state(scope, state);
    return Status::OK();
}
999
1000
// Creates the LikeState for a REGEXP expression.
//
// Nothing is done for scopes other than THREAD_LOCAL. Otherwise a fresh state
// with the generic regexp_fn / regexp_fn_scalar matchers is created and stored
// in the context right away. If argument 1 (the pattern) is constant, it is
// tested against ALLPASS_RE, EQUALS_RE, STARTS_WITH_RE, ENDS_WITH_RE and
// SUBSTRING_RE in that order and the first hit installs the corresponding
// constant_*_fn pair. If none hits, the pattern is compiled with hyperscan,
// then RE2, and finally — only when enable_extended_regex is on — boost::regex.
//
// Returns InternalError when the pattern cannot be compiled by the engines
// that are allowed; otherwise OK.
Status FunctionRegexpLike::open(FunctionContext* context,
                                FunctionContext::FunctionStateScope scope) {
    if (scope != FunctionContext::THREAD_LOCAL) {
        return Status::OK();
    }

    auto state = std::make_shared<LikeState>();
    context->set_function_state(scope, state);
    state->is_like_pattern = false;
    state->function = regexp_fn;
    state->scalar_function = regexp_fn_scalar;

    // A non-constant pattern keeps the generic matchers installed above.
    if (!context->is_col_constant(1)) {
        return Status::OK();
    }

    using ScalarFn = decltype(state->scalar_function);

    const auto pattern_col = context->get_constant_col(1)->column_ptr;
    const auto pattern = pattern_col->get_data_at(0);

    std::string pattern_str = pattern.to_string();
    std::string search_string;

    // Stores the captured literal and installs the given matcher pair.
    const auto install_literal_path = [&state, &search_string](const LikeFn& fn,
                                                               const ScalarFn& scalar_fn) {
        state->search_state.set_search_string(search_string);
        state->function = fn;
        state->scalar_function = scalar_fn;
    };

    if (RE2::FullMatch(pattern_str, ALLPASS_RE)) {
        state->search_state.set_search_string("");
        state->function = constant_allpass_fn;
        state->scalar_function = constant_allpass_fn_scalar;
        return Status::OK();
    }
    if (RE2::FullMatch(pattern_str, EQUALS_RE, &search_string)) {
        install_literal_path(constant_equals_fn, constant_equals_fn_scalar);
        return Status::OK();
    }
    if (RE2::FullMatch(pattern_str, STARTS_WITH_RE, &search_string)) {
        install_literal_path(constant_starts_with_fn, constant_starts_with_fn_scalar);
        return Status::OK();
    }
    if (RE2::FullMatch(pattern_str, ENDS_WITH_RE, &search_string)) {
        install_literal_path(constant_ends_with_fn, constant_ends_with_fn_scalar);
        return Status::OK();
    }
    if (RE2::FullMatch(pattern_str, SUBSTRING_RE, &search_string)) {
        install_literal_path(constant_substring_fn, constant_substring_fn_scalar);
        return Status::OK();
    }

    // General regex: hyperscan first, then RE2, then (optionally) boost::regex.
    hs_database_t* database = nullptr;
    hs_scratch_t* scratch = nullptr;
    if (hs_prepare(context, pattern_str.c_str(), &database, &scratch).ok()) {
        // use hyperscan
        state->search_state.hs_database.reset(database);
        state->search_state.hs_scratch.reset(scratch);
    } else {
        // fallback to re2
        // reset hs_database to nullptr to indicate not use hyperscan
        state->search_state.hs_database.reset();
        state->search_state.hs_scratch.reset();

        RE2::Options opts;
        opts.set_never_nl(false);
        opts.set_dot_nl(true);
        state->search_state.regex = std::make_unique<RE2>(pattern_str, opts);

        if (!state->search_state.regex->ok()) {
            if (!context->state()->enable_extended_regex()) {
                return Status::InternalError(
                        "Invalid regex expression: {}. Error: {}. If you need advanced "
                        "regex features, try setting enable_extended_regex=true",
                        pattern_str, state->search_state.regex->error());
            }

            // RE2 failed, fallback to Boost.Regex
            // This handles advanced regex features like zero-width assertions
            state->search_state.regex.reset();
            try {
                state->search_state.boost_regex = std::make_unique<boost::regex>(pattern_str);
            } catch (const boost::regex_error& e) {
                return Status::InternalError("Invalid regex expression: {}. Error: {}",
                                             pattern_str, e.what());
            }
        }
    }

    state->function = constant_regex_fn;
    state->scalar_function = constant_regex_fn_scalar;
    return Status::OK();
}
1078
1079
8
// Registers FunctionLike with the given function factory.
void register_function_like(SimpleFunctionFactory& factory) {
    factory.register_function<FunctionLike>();
}
1082
1083
8
// Registers FunctionRegexpLike with the given function factory, then adds
// FunctionRegexpLike::alias as an alias of FunctionRegexpLike::name.
void register_function_regexp(SimpleFunctionFactory& factory) {
    factory.register_function<FunctionRegexpLike>();
    factory.register_alias(FunctionRegexpLike::name, FunctionRegexpLike::alias);
}
1087
#include "common/compile_check_end.h"
1088
} // namespace doris