Coverage Report

Created: 2026-03-19 18:23

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exprs/function/like.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "exprs/function/like.h"
19
20
#include <fmt/format.h>
21
#include <hs/hs_compile.h>
22
#include <re2/stringpiece.h>
23
24
#include <cstddef>
25
#include <ostream>
26
#include <utility>
27
#include <vector>
28
29
#include "common/logging.h"
30
#include "core/block/block.h"
31
#include "core/block/column_with_type_and_name.h"
32
#include "core/column/column.h"
33
#include "core/column/column_const.h"
34
#include "core/column/column_vector.h"
35
#include "core/string_ref.h"
36
#include "exprs/function/simple_function_factory.h"
37
38
namespace doris {
39
#include "common/compile_check_begin.h"
40
// A regex to match any regex pattern is equivalent to a substring search.
41
static const RE2 SUBSTRING_RE(R"((?:\.\*)*([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)(?:\.\*)*)");
42
43
// A regex to match any regex pattern which is equivalent to matching a constant string
44
// at the end of the string values.
45
static const RE2 ENDS_WITH_RE(R"((?:\.\*)*([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)\$)");
46
47
// A regex to match any regex pattern which is equivalent to matching a constant string
48
// at the end of the string values.
49
static const RE2 STARTS_WITH_RE(R"(\^([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)(?:\.\*)*)");
50
51
// A regex to match any regex pattern which is equivalent to a constant string match.
52
static const RE2 EQUALS_RE(R"(\^([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)\$)");
53
// A regex to match .*
54
static const RE2 ALLPASS_RE(R"((\.\*)+)");
55
56
// Like patterns
57
static const re2::RE2 LIKE_SUBSTRING_RE(R"((?:%+)(((\\_)|([^%_\\]))+)(?:%+))");
58
static const re2::RE2 LIKE_ENDS_WITH_RE("(?:%+)(((\\\\_)|([^%_]))+)");
59
static const re2::RE2 LIKE_STARTS_WITH_RE(R"((((\\%)|(\\_)|([^%_\\]))+)(?:%+))");
60
static const re2::RE2 LIKE_EQUALS_RE("(((\\\\_)|([^%_]))+)");
61
static const re2::RE2 LIKE_ALLPASS_RE("%+");
62
63
struct VectorAllpassSearchState : public VectorPatternSearchState {
64
614
    VectorAllpassSearchState() : VectorPatternSearchState(FunctionLikeBase::vector_allpass_fn) {}
65
66
614
    ~VectorAllpassSearchState() override = default;
67
68
624
    void like_pattern_match(const std::string& pattern_str) override {
69
624
        if (!pattern_str.empty() && RE2::FullMatch(pattern_str, LIKE_ALLPASS_RE)) {
70
52
            _search_strings->insert_default();
71
572
        } else {
72
572
            _pattern_matched = false;
73
572
        }
74
624
    }
75
76
0
    void regexp_pattern_match(const std::string& pattern_str) override {
77
0
        if (RE2::FullMatch(pattern_str, ALLPASS_RE)) {
78
0
            _search_strings->insert_default();
79
0
        } else {
80
0
            _pattern_matched = false;
81
0
        }
82
0
    }
83
};
84
85
struct VectorEqualSearchState : public VectorPatternSearchState {
86
614
    VectorEqualSearchState() : VectorPatternSearchState(FunctionLikeBase::vector_equals_fn) {}
87
88
614
    ~VectorEqualSearchState() override = default;
89
90
630
    void like_pattern_match(const std::string& pattern_str) override {
91
630
        _search_string.clear();
92
630
        if (pattern_str.empty() || RE2::FullMatch(pattern_str, LIKE_EQUALS_RE, &_search_string)) {
93
172
            FunctionLike::remove_escape_character(&_search_string);
94
172
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
95
458
        } else {
96
458
            _pattern_matched = false;
97
458
        }
98
630
    }
99
100
0
    void regexp_pattern_match(const std::string& pattern_str) override {
101
0
        _search_string.clear();
102
0
        if (RE2::FullMatch(pattern_str, EQUALS_RE, &_search_string)) {
103
0
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
104
0
        } else {
105
0
            _pattern_matched = false;
106
0
        }
107
0
    }
108
};
109
110
struct VectorSubStringSearchState : public VectorPatternSearchState {
111
    VectorSubStringSearchState()
112
614
            : VectorPatternSearchState(FunctionLikeBase::vector_substring_fn) {}
113
114
614
    ~VectorSubStringSearchState() override = default;
115
116
642
    void like_pattern_match(const std::string& pattern_str) override {
117
642
        _search_string.clear();
118
642
        if (RE2::FullMatch(pattern_str, LIKE_SUBSTRING_RE, &_search_string)) {
119
82
            FunctionLike::remove_escape_character(&_search_string);
120
82
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
121
560
        } else {
122
560
            _pattern_matched = false;
123
560
        }
124
642
    }
125
126
0
    void regexp_pattern_match(const std::string& pattern_str) override {
127
0
        _search_string.clear();
128
0
        if (RE2::FullMatch(pattern_str, SUBSTRING_RE, &_search_string)) {
129
0
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
130
0
        } else {
131
0
            _pattern_matched = false;
132
0
        }
133
0
    }
134
};
135
136
struct VectorStartsWithSearchState : public VectorPatternSearchState {
137
    VectorStartsWithSearchState()
138
614
            : VectorPatternSearchState(FunctionLikeBase::vector_starts_with_fn) {}
139
140
614
    ~VectorStartsWithSearchState() override = default;
141
142
626
    void like_pattern_match(const std::string& pattern_str) override {
143
626
        _search_string.clear();
144
626
        if (RE2::FullMatch(pattern_str, LIKE_STARTS_WITH_RE, &_search_string)) {
145
74
            FunctionLike::remove_escape_character(&_search_string);
146
74
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
147
552
        } else {
148
552
            _pattern_matched = false;
149
552
        }
150
626
    }
151
152
0
    void regexp_pattern_match(const std::string& pattern_str) override {
153
0
        _search_string.clear();
154
0
        if (RE2::FullMatch(pattern_str, STARTS_WITH_RE, &_search_string)) {
155
0
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
156
0
        } else {
157
0
            _pattern_matched = false;
158
0
        }
159
0
    }
160
};
161
162
struct VectorEndsWithSearchState : public VectorPatternSearchState {
163
614
    VectorEndsWithSearchState() : VectorPatternSearchState(FunctionLikeBase::vector_ends_with_fn) {}
164
165
614
    ~VectorEndsWithSearchState() override = default;
166
167
624
    void like_pattern_match(const std::string& pattern_str) override {
168
624
        _search_string.clear();
169
624
        if (RE2::FullMatch(pattern_str, LIKE_ENDS_WITH_RE, &_search_string)) {
170
72
            FunctionLike::remove_escape_character(&_search_string);
171
72
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
172
552
        } else {
173
552
            _pattern_matched = false;
174
552
        }
175
624
    }
176
177
0
    void regexp_pattern_match(const std::string& pattern_str) override {
178
0
        _search_string.clear();
179
0
        if (RE2::FullMatch(pattern_str, ENDS_WITH_RE, &_search_string)) {
180
0
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
181
0
        } else {
182
0
            _pattern_matched = false;
183
0
        }
184
0
    }
185
};
186
187
0
Status LikeSearchState::clone(LikeSearchState& cloned) {
188
0
    cloned.set_search_string(search_string);
189
190
0
    std::string re_pattern;
191
0
    FunctionLike::convert_like_pattern(this, pattern_str, &re_pattern);
192
0
    if (hs_database) { // use hyperscan
193
0
        hs_database_t* database = nullptr;
194
0
        hs_scratch_t* scratch = nullptr;
195
0
        RETURN_IF_ERROR(FunctionLike::hs_prepare(nullptr, re_pattern.c_str(), &database, &scratch));
196
197
0
        cloned.hs_database.reset(database);
198
0
        cloned.hs_scratch.reset(scratch);
199
0
    } else { // fallback to re2
200
0
        cloned.hs_database.reset();
201
0
        cloned.hs_scratch.reset();
202
203
0
        RE2::Options opts;
204
0
        opts.set_never_nl(false);
205
0
        opts.set_dot_nl(true);
206
0
        cloned.regex = std::make_unique<RE2>(re_pattern, opts);
207
0
        if (!cloned.regex->ok()) {
208
0
            return Status::InternalError("Invalid regex expression: {}", re_pattern);
209
0
        }
210
0
    }
211
212
0
    return Status::OK();
213
0
}
214
215
Status FunctionLikeBase::constant_allpass_fn(const LikeSearchState* state, const ColumnString& vals,
216
                                             const StringRef& pattern,
217
28
                                             ColumnUInt8::Container& result) {
218
28
    memset(result.data(), 1, vals.size());
219
28
    return Status::OK();
220
28
}
221
222
Status FunctionLikeBase::constant_allpass_fn_scalar(const LikeSearchState* state,
223
                                                    const StringRef& val, const StringRef& pattern,
224
0
                                                    unsigned char* result) {
225
0
    *result = 1;
226
0
    return Status::OK();
227
0
}
228
229
Status FunctionLikeBase::vector_allpass_fn(const ColumnString& vals,
230
                                           const ColumnString& search_strings,
231
42
                                           ColumnUInt8::Container& result) {
232
42
    DCHECK(vals.size() == search_strings.size());
233
42
    DCHECK(vals.size() == result.size());
234
42
    memset(result.data(), 1, vals.size());
235
42
    return Status::OK();
236
42
}
237
238
Status FunctionLikeBase::constant_starts_with_fn(const LikeSearchState* state,
239
                                                 const ColumnString& val, const StringRef& pattern,
240
44
                                                 ColumnUInt8::Container& result) {
241
44
    auto sz = val.size();
242
88
    for (size_t i = 0; i < sz; i++) {
243
44
        const auto& str_ref = val.get_data_at(i);
244
44
        result[i] = (str_ref.size >= state->search_string_sv.size) &&
245
44
                    str_ref.start_with(state->search_string_sv);
246
44
    }
247
44
    return Status::OK();
248
44
}
249
250
Status FunctionLikeBase::constant_starts_with_fn_scalar(const LikeSearchState* state,
251
                                                        const StringRef& val,
252
                                                        const StringRef& pattern,
253
0
                                                        unsigned char* result) {
254
0
    *result = (val.size >= state->search_string_sv.size) &&
255
0
              (state->search_string_sv == val.substring(0, state->search_string_sv.size));
256
0
    return Status::OK();
257
0
}
258
259
Status FunctionLikeBase::vector_starts_with_fn(const ColumnString& vals,
260
                                               const ColumnString& search_strings,
261
62
                                               ColumnUInt8::Container& result) {
262
62
    DCHECK(vals.size() == search_strings.size());
263
62
    DCHECK(vals.size() == result.size());
264
62
    auto sz = vals.size();
265
134
    for (size_t i = 0; i < sz; ++i) {
266
72
        const auto& str_sv = vals.get_data_at(i);
267
72
        const auto& search_string_sv = search_strings.get_data_at(i);
268
72
        result[i] = (str_sv.size >= search_string_sv.size) && str_sv.start_with(search_string_sv);
269
72
    }
270
62
    return Status::OK();
271
62
}
272
273
Status FunctionLikeBase::constant_ends_with_fn(const LikeSearchState* state,
274
                                               const ColumnString& val, const StringRef& pattern,
275
44
                                               ColumnUInt8::Container& result) {
276
44
    auto sz = val.size();
277
88
    for (size_t i = 0; i < sz; i++) {
278
44
        const auto& str_ref = val.get_data_at(i);
279
44
        result[i] = (str_ref.size >= state->search_string_sv.size) &&
280
44
                    str_ref.end_with(state->search_string_sv);
281
44
    }
282
44
    return Status::OK();
283
44
}
284
285
Status FunctionLikeBase::constant_ends_with_fn_scalar(const LikeSearchState* state,
286
                                                      const StringRef& val,
287
                                                      const StringRef& pattern,
288
0
                                                      unsigned char* result) {
289
0
    *result = (val.size >= state->search_string_sv.size) &&
290
0
              (state->search_string_sv == val.substring(val.size - state->search_string_sv.size,
291
0
                                                        state->search_string_sv.size));
292
0
    return Status::OK();
293
0
}
294
295
Status FunctionLikeBase::vector_ends_with_fn(const ColumnString& vals,
296
                                             const ColumnString& search_strings,
297
62
                                             ColumnUInt8::Container& result) {
298
62
    DCHECK(vals.size() == search_strings.size());
299
62
    DCHECK(vals.size() == result.size());
300
62
    auto sz = vals.size();
301
134
    for (size_t i = 0; i < sz; ++i) {
302
72
        const auto& str_sv = vals.get_data_at(i);
303
72
        const auto& search_string_sv = search_strings.get_data_at(i);
304
72
        result[i] = (str_sv.size >= search_string_sv.size) && str_sv.end_with(search_string_sv);
305
72
    }
306
62
    return Status::OK();
307
62
}
308
309
Status FunctionLikeBase::constant_equals_fn(const LikeSearchState* state, const ColumnString& val,
310
                                            const StringRef& pattern,
311
94
                                            ColumnUInt8::Container& result) {
312
94
    auto sz = val.size();
313
188
    for (size_t i = 0; i < sz; i++) {
314
94
        result[i] = (val.get_data_at(i) == state->search_string_sv);
315
94
    }
316
94
    return Status::OK();
317
94
}
318
319
Status FunctionLikeBase::constant_equals_fn_scalar(const LikeSearchState* state,
320
                                                   const StringRef& val, const StringRef& pattern,
321
0
                                                   unsigned char* result) {
322
0
    *result = (val == state->search_string_sv);
323
0
    return Status::OK();
324
0
}
325
326
Status FunctionLikeBase::vector_equals_fn(const ColumnString& vals,
327
                                          const ColumnString& search_strings,
328
156
                                          ColumnUInt8::Container& result) {
329
156
    DCHECK(vals.size() == search_strings.size());
330
156
    DCHECK(vals.size() == result.size());
331
156
    auto sz = vals.size();
332
312
    for (size_t i = 0; i < sz; ++i) {
333
156
        const auto& str_sv = vals.get_data_at(i);
334
156
        const auto& search_string_sv = search_strings.get_data_at(i);
335
156
        result[i] = str_sv == search_string_sv;
336
156
    }
337
156
    return Status::OK();
338
156
}
339
340
Status FunctionLikeBase::constant_substring_fn(const LikeSearchState* state,
341
                                               const ColumnString& val, const StringRef& pattern,
342
0
                                               ColumnUInt8::Container& result) {
343
0
    auto sz = val.size();
344
0
    for (size_t i = 0; i < sz; i++) {
345
0
        if (state->search_string_sv.size == 0) {
346
0
            result[i] = true;
347
0
            continue;
348
0
        }
349
0
        result[i] = state->substring_pattern.search(val.get_data_at(i)) != -1;
350
0
    }
351
0
    return Status::OK();
352
0
}
353
354
Status FunctionLikeBase::constant_substring_fn_scalar(const LikeSearchState* state,
355
                                                      const StringRef& val,
356
                                                      const StringRef& pattern,
357
0
                                                      unsigned char* result) {
358
0
    if (state->search_string_sv.size == 0) {
359
0
        *result = true;
360
0
        return Status::OK();
361
0
    }
362
0
    *result = state->substring_pattern.search(val) != -1;
363
0
    return Status::OK();
364
0
}
365
366
Status FunctionLikeBase::vector_substring_fn(const ColumnString& vals,
367
                                             const ColumnString& search_strings,
368
54
                                             ColumnUInt8::Container& result) {
369
54
    DCHECK(vals.size() == search_strings.size());
370
54
    DCHECK(vals.size() == result.size());
371
54
    auto sz = vals.size();
372
108
    for (size_t i = 0; i < sz; ++i) {
373
54
        const auto& str_sv = vals.get_data_at(i);
374
54
        const auto& search_string_sv = search_strings.get_data_at(i);
375
54
        if (search_string_sv.size == 0) {
376
0
            result[i] = true;
377
0
            continue;
378
0
        }
379
54
        doris::StringSearch substring_search(&search_string_sv);
380
54
        result[i] = substring_search.search(str_sv) != -1;
381
54
    }
382
54
    return Status::OK();
383
54
}
384
385
Status FunctionLikeBase::constant_regex_fn_scalar(const LikeSearchState* state,
386
                                                  const StringRef& val, const StringRef& pattern,
387
106
                                                  unsigned char* result) {
388
106
    if (state->hs_database) { // use hyperscan
389
106
        auto ret = hs_scan(state->hs_database.get(), val.data, (int)val.size, 0,
390
106
                           state->hs_scratch.get(), doris::LikeSearchState::hs_match_handler,
391
106
                           (void*)result);
392
106
        if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
393
0
            return Status::RuntimeError(fmt::format("hyperscan error: {}", ret));
394
0
        }
395
106
    } else if (state->boost_regex) { // use boost::regex for advanced features
396
0
        *result = boost::regex_search(val.data, val.data + val.size, *state->boost_regex);
397
0
    } else { // fallback to re2
398
0
        *result = RE2::PartialMatch(re2::StringPiece(val.data, val.size), *state->regex);
399
0
    }
400
401
106
    return Status::OK();
402
106
}
403
404
Status FunctionLikeBase::regexp_fn_scalar(const LikeSearchState* state, const StringRef& val,
405
180
                                          const StringRef& pattern, unsigned char* result) {
406
180
    RE2::Options opts;
407
180
    opts.set_never_nl(false);
408
180
    opts.set_dot_nl(true);
409
180
    re2::RE2 re(re2::StringPiece(pattern.data, pattern.size), opts);
410
180
    if (re.ok()) {
411
180
        *result = RE2::PartialMatch(re2::StringPiece(val.data, val.size), re);
412
180
    } else {
413
0
        return Status::RuntimeError("Invalid pattern: {}", pattern.debug_string());
414
0
    }
415
416
180
    return Status::OK();
417
180
}
418
419
Status FunctionLikeBase::constant_regex_fn(const LikeSearchState* state, const ColumnString& val,
420
                                           const StringRef& pattern,
421
150
                                           ColumnUInt8::Container& result) {
422
150
    auto sz = val.size();
423
150
    if (state->hs_database) { // use hyperscan
424
300
        for (size_t i = 0; i < sz; i++) {
425
150
            const auto& str_ref = val.get_data_at(i);
426
150
            auto ret = hs_scan(state->hs_database.get(), str_ref.data, (int)str_ref.size, 0,
427
150
                               state->hs_scratch.get(), doris::LikeSearchState::hs_match_handler,
428
150
                               (void*)(result.data() + i));
429
150
            if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
430
0
                return Status::RuntimeError(fmt::format("hyperscan error: {}", ret));
431
0
            }
432
150
        }
433
150
    } else if (state->boost_regex) { // use boost::regex for advanced features
434
0
        for (size_t i = 0; i < sz; i++) {
435
0
            const auto& str_ref = val.get_data_at(i);
436
0
            *(result.data() + i) = boost::regex_search(str_ref.data, str_ref.data + str_ref.size,
437
0
                                                       *state->boost_regex);
438
0
        }
439
0
    } else { // fallback to re2
440
0
        for (size_t i = 0; i < sz; i++) {
441
0
            const auto& str_ref = val.get_data_at(i);
442
0
            *(result.data() + i) =
443
0
                    RE2::PartialMatch(re2::StringPiece(str_ref.data, str_ref.size), *state->regex);
444
0
        }
445
0
    }
446
447
150
    return Status::OK();
448
150
}
449
450
Status FunctionLikeBase::regexp_fn(const LikeSearchState* state, const ColumnString& val,
451
0
                                   const StringRef& pattern, ColumnUInt8::Container& result) {
452
0
    std::string re_pattern(pattern.data, pattern.size);
453
454
0
    hs_database_t* database = nullptr;
455
0
    hs_scratch_t* scratch = nullptr;
456
0
    if (hs_prepare(nullptr, re_pattern.c_str(), &database, &scratch).ok()) { // use hyperscan
457
0
        auto sz = val.size();
458
0
        for (size_t i = 0; i < sz; i++) {
459
0
            const auto& str_ref = val.get_data_at(i);
460
0
            auto ret =
461
0
                    hs_scan(database, str_ref.data, (int)str_ref.size, 0, scratch,
462
0
                            doris::LikeSearchState::hs_match_handler, (void*)(result.data() + i));
463
0
            if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
464
0
                return Status::RuntimeError(fmt::format("hyperscan error: {}", ret));
465
0
            }
466
0
        }
467
468
0
        hs_free_scratch(scratch);
469
0
        hs_free_database(database);
470
0
    } else { // fallback to re2
471
0
        RE2::Options opts;
472
0
        opts.set_never_nl(false);
473
0
        opts.set_dot_nl(true);
474
0
        re2::RE2 re(re_pattern, opts);
475
0
        if (re.ok()) {
476
0
            auto sz = val.size();
477
0
            for (size_t i = 0; i < sz; i++) {
478
0
                const auto& str_ref = val.get_data_at(i);
479
0
                *(result.data() + i) =
480
0
                        RE2::PartialMatch(re2::StringPiece(str_ref.data, str_ref.size), re);
481
0
            }
482
0
        } else {
483
0
            return Status::RuntimeError("Invalid pattern: {}", pattern.debug_string());
484
0
        }
485
0
    }
486
487
0
    return Status::OK();
488
0
}
489
490
// hyperscan compile expression to database and allocate scratch space
491
Status FunctionLikeBase::hs_prepare(FunctionContext* context, const char* expression,
492
264
                                    hs_database_t** database, hs_scratch_t** scratch) {
493
264
    hs_compile_error_t* compile_err;
494
264
    auto res = hs_compile(expression, HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8,
495
264
                          HS_MODE_BLOCK, nullptr, database, &compile_err);
496
497
264
    if (res != HS_SUCCESS) {
498
0
        *database = nullptr;
499
0
        std::string error_message = compile_err->message;
500
0
        hs_free_compile_error(compile_err);
501
        // Do not call FunctionContext::set_error here, since we do not want to cancel the query here.
502
0
        return Status::RuntimeError<false>("hs_compile regex pattern error:" + error_message);
503
0
    }
504
264
    hs_free_compile_error(compile_err);
505
506
264
    if (hs_alloc_scratch(*database, scratch) != HS_SUCCESS) {
507
0
        hs_free_database(*database);
508
0
        *database = nullptr;
509
0
        *scratch = nullptr;
510
        // Do not call FunctionContext::set_error here, since we do not want to cancel the query here.
511
0
        return Status::RuntimeError<false>("hs_alloc_scratch allocate scratch space error");
512
0
    }
513
514
264
    return Status::OK();
515
264
}
516
517
Status FunctionLikeBase::execute_impl(FunctionContext* context, Block& block,
518
                                      const ColumnNumbers& arguments, uint32_t result,
519
1.10k
                                      size_t input_rows_count) const {
520
1.10k
    const auto values_col =
521
1.10k
            block.get_by_position(arguments[0]).column->convert_to_full_column_if_const();
522
1.10k
    const auto* values = check_and_get_column<ColumnString>(values_col.get());
523
524
1.10k
    if (!values) {
525
0
        return Status::InternalError("Not supported input arguments types");
526
0
    }
527
    // result column
528
1.10k
    auto res = ColumnUInt8::create();
529
1.10k
    ColumnUInt8::Container& vec_res = res->get_data();
530
    // set default value to 0, and match functions only need to set 1/true
531
1.10k
    vec_res.resize_fill(input_rows_count);
532
1.10k
    auto* state = reinterpret_cast<LikeState*>(
533
1.10k
            context->get_function_state(FunctionContext::THREAD_LOCAL));
534
    // for constant_substring_fn, use long run length search for performance
535
1.10k
    if (constant_substring_fn ==
536
1.10k
        *(state->function
537
1.10k
                  .target<doris::Status (*)(const LikeSearchState* state, const ColumnString&,
538
1.10k
                                            const StringRef&, ColumnUInt8::Container&)>())) {
539
132
        RETURN_IF_ERROR(execute_substring(values->get_chars(), values->get_offsets(), vec_res,
540
132
                                          &state->search_state));
541
974
    } else {
542
974
        const auto pattern_col = block.get_by_position(arguments[1]).column;
543
974
        if (const auto* str_patterns = check_and_get_column<ColumnString>(pattern_col.get())) {
544
614
            RETURN_IF_ERROR(
545
614
                    vector_non_const(*values, *str_patterns, vec_res, state, input_rows_count));
546
614
        } else if (const auto* const_patterns =
547
360
                           check_and_get_column<ColumnConst>(pattern_col.get())) {
548
360
            const auto& pattern_val = const_patterns->get_data_at(0);
549
360
            RETURN_IF_ERROR(vector_const(*values, &pattern_val, vec_res, state->function,
550
360
                                         &state->search_state));
551
360
        } else {
552
0
            return Status::InternalError("Not supported input arguments types");
553
0
        }
554
974
    }
555
1.10k
    block.replace_by_position(result, std::move(res));
556
1.10k
    return Status::OK();
557
1.10k
}
558
559
Status FunctionLikeBase::execute_substring(const ColumnString::Chars& values,
560
                                           const ColumnString::Offsets& value_offsets,
561
                                           ColumnUInt8::Container& result,
562
132
                                           LikeSearchState* search_state) const {
563
    // treat continuous multi string data as a long string data
564
132
    const UInt8* begin = values.data();
565
132
    const UInt8* end = begin + values.size();
566
132
    const UInt8* pos = begin;
567
568
    /// Current index in the array of strings.
569
132
    size_t i = 0;
570
132
    size_t needle_size = search_state->substring_pattern.get_pattern_length();
571
572
    /// We will search for the next occurrence in all strings at once.
573
216
    while (pos < end) {
574
        // search return matched substring start offset
575
128
        pos = (UInt8*)search_state->substring_pattern.search((char*)pos, end - pos);
576
128
        if (pos >= end) {
577
44
            break;
578
44
        }
579
580
        /// Determine which index it refers to.
581
        /// begin + value_offsets[i] is the start offset of string at i+1
582
84
        while (i < value_offsets.size() && begin + value_offsets[i] < pos) {
583
0
            ++i;
584
0
        }
585
586
        /// We check that the entry does not pass through the boundaries of strings.
587
84
        if (pos + needle_size <= begin + value_offsets[i]) {
588
84
            result[i] = 1;
589
84
        }
590
591
        // move to next string offset
592
84
        pos = begin + value_offsets[i];
593
84
        ++i;
594
84
    }
595
596
132
    return Status::OK();
597
132
}
598
599
Status FunctionLikeBase::vector_const(const ColumnString& values, const StringRef* pattern_val,
600
                                      ColumnUInt8::Container& result, const LikeFn& function,
601
360
                                      LikeSearchState* search_state) const {
602
360
    RETURN_IF_ERROR((function)(search_state, values,
603
360
                               *reinterpret_cast<const StringRef*>(pattern_val), result));
604
360
    return Status::OK();
605
360
}
606
607
template <bool LIKE_PATTERN>
608
614
VPatternSearchStateSPtr FunctionLikeBase::pattern_type_recognition(const ColumnString& patterns) {
609
614
    VPatternSearchStateSPtr allpass_state = std::make_shared<VectorAllpassSearchState>();
610
614
    VPatternSearchStateSPtr equal_state = std::make_shared<VectorEqualSearchState>();
611
614
    VPatternSearchStateSPtr substring_state = std::make_shared<VectorSubStringSearchState>();
612
614
    VPatternSearchStateSPtr starts_with_state = std::make_shared<VectorStartsWithSearchState>();
613
614
    VPatternSearchStateSPtr ends_with_state = std::make_shared<VectorEndsWithSearchState>();
614
614
    size_t size = patterns.size();
615
616
1.30k
    for (size_t i = 0; i < size; ++i) {
617
714
        if (!allpass_state->_pattern_matched && !equal_state->_pattern_matched &&
618
714
            !substring_state->_pattern_matched && !starts_with_state->_pattern_matched &&
619
714
            !ends_with_state->_pattern_matched) {
620
24
            return nullptr;
621
24
        }
622
690
        std::string pattern_str = patterns.get_data_at(i).to_string();
623
690
        if (allpass_state->_pattern_matched) {
624
624
            if constexpr (LIKE_PATTERN) {
625
624
                allpass_state->like_pattern_match(pattern_str);
626
624
            } else {
627
0
                allpass_state->regexp_pattern_match(pattern_str);
628
0
            }
629
624
        }
630
690
        if (equal_state->_pattern_matched) {
631
630
            if constexpr (LIKE_PATTERN) {
632
630
                equal_state->like_pattern_match(pattern_str);
633
630
            } else {
634
0
                equal_state->regexp_pattern_match(pattern_str);
635
0
            }
636
630
        }
637
690
        if (substring_state->_pattern_matched) {
638
642
            if constexpr (LIKE_PATTERN) {
639
642
                substring_state->like_pattern_match(pattern_str);
640
642
            } else {
641
0
                substring_state->regexp_pattern_match(pattern_str);
642
0
            }
643
642
        }
644
690
        if (starts_with_state->_pattern_matched) {
645
626
            if constexpr (LIKE_PATTERN) {
646
626
                starts_with_state->like_pattern_match(pattern_str);
647
626
            } else {
648
0
                starts_with_state->regexp_pattern_match(pattern_str);
649
0
            }
650
626
        }
651
690
        if (ends_with_state->_pattern_matched) {
652
624
            if constexpr (LIKE_PATTERN) {
653
624
                ends_with_state->like_pattern_match(pattern_str);
654
624
            } else {
655
0
                ends_with_state->regexp_pattern_match(pattern_str);
656
0
            }
657
624
        }
658
690
    }
659
660
590
    if (allpass_state->_pattern_matched) {
661
42
        return allpass_state;
662
548
    } else if (equal_state->_pattern_matched) {
663
156
        return equal_state;
664
392
    } else if (substring_state->_pattern_matched) {
665
54
        return substring_state;
666
338
    } else if (starts_with_state->_pattern_matched) {
667
62
        return starts_with_state;
668
276
    } else if (ends_with_state->_pattern_matched) {
669
62
        return ends_with_state;
670
214
    } else {
671
214
        return nullptr;
672
214
    }
673
590
}
_ZN5doris16FunctionLikeBase24pattern_type_recognitionILb1EEESt10shared_ptrINS_24VectorPatternSearchStateEERKNS_9ColumnStrIjEE
Line
Count
Source
608
614
VPatternSearchStateSPtr FunctionLikeBase::pattern_type_recognition(const ColumnString& patterns) {
609
614
    VPatternSearchStateSPtr allpass_state = std::make_shared<VectorAllpassSearchState>();
610
614
    VPatternSearchStateSPtr equal_state = std::make_shared<VectorEqualSearchState>();
611
614
    VPatternSearchStateSPtr substring_state = std::make_shared<VectorSubStringSearchState>();
612
614
    VPatternSearchStateSPtr starts_with_state = std::make_shared<VectorStartsWithSearchState>();
613
614
    VPatternSearchStateSPtr ends_with_state = std::make_shared<VectorEndsWithSearchState>();
614
614
    size_t size = patterns.size();
615
616
1.30k
    for (size_t i = 0; i < size; ++i) {
617
714
        if (!allpass_state->_pattern_matched && !equal_state->_pattern_matched &&
618
714
            !substring_state->_pattern_matched && !starts_with_state->_pattern_matched &&
619
714
            !ends_with_state->_pattern_matched) {
620
24
            return nullptr;
621
24
        }
622
690
        std::string pattern_str = patterns.get_data_at(i).to_string();
623
690
        if (allpass_state->_pattern_matched) {
624
624
            if constexpr (LIKE_PATTERN) {
625
624
                allpass_state->like_pattern_match(pattern_str);
626
            } else {
627
                allpass_state->regexp_pattern_match(pattern_str);
628
            }
629
624
        }
630
690
        if (equal_state->_pattern_matched) {
631
630
            if constexpr (LIKE_PATTERN) {
632
630
                equal_state->like_pattern_match(pattern_str);
633
            } else {
634
                equal_state->regexp_pattern_match(pattern_str);
635
            }
636
630
        }
637
690
        if (substring_state->_pattern_matched) {
638
642
            if constexpr (LIKE_PATTERN) {
639
642
                substring_state->like_pattern_match(pattern_str);
640
            } else {
641
                substring_state->regexp_pattern_match(pattern_str);
642
            }
643
642
        }
644
690
        if (starts_with_state->_pattern_matched) {
645
626
            if constexpr (LIKE_PATTERN) {
646
626
                starts_with_state->like_pattern_match(pattern_str);
647
            } else {
648
                starts_with_state->regexp_pattern_match(pattern_str);
649
            }
650
626
        }
651
690
        if (ends_with_state->_pattern_matched) {
652
624
            if constexpr (LIKE_PATTERN) {
653
624
                ends_with_state->like_pattern_match(pattern_str);
654
            } else {
655
                ends_with_state->regexp_pattern_match(pattern_str);
656
            }
657
624
        }
658
690
    }
659
660
590
    if (allpass_state->_pattern_matched) {
661
42
        return allpass_state;
662
548
    } else if (equal_state->_pattern_matched) {
663
156
        return equal_state;
664
392
    } else if (substring_state->_pattern_matched) {
665
54
        return substring_state;
666
338
    } else if (starts_with_state->_pattern_matched) {
667
62
        return starts_with_state;
668
276
    } else if (ends_with_state->_pattern_matched) {
669
62
        return ends_with_state;
670
214
    } else {
671
214
        return nullptr;
672
214
    }
673
590
}
Unexecuted instantiation: _ZN5doris16FunctionLikeBase24pattern_type_recognitionILb0EEESt10shared_ptrINS_24VectorPatternSearchStateEERKNS_9ColumnStrIjEE
674
675
Status FunctionLikeBase::vector_non_const(const ColumnString& values, const ColumnString& patterns,
676
                                          ColumnUInt8::Container& result, LikeState* state,
677
614
                                          size_t input_rows_count) const {
678
614
    ColumnString::MutablePtr replaced_patterns;
679
614
    VPatternSearchStateSPtr vector_search_state;
680
614
    if (state->is_like_pattern) {
681
614
        if (state->has_custom_escape) {
682
0
            replaced_patterns = ColumnString::create();
683
0
            for (int i = 0; i < input_rows_count; ++i) {
684
0
                std::string val =
685
0
                        replace_pattern_by_escape(patterns.get_data_at(i), state->escape_char);
686
0
                replaced_patterns->insert_data(val.c_str(), val.size());
687
0
            }
688
0
            vector_search_state = pattern_type_recognition<true>(*replaced_patterns);
689
614
        } else {
690
614
            vector_search_state = pattern_type_recognition<true>(patterns);
691
614
        }
692
614
    } else {
693
0
        vector_search_state = pattern_type_recognition<false>(patterns);
694
0
    }
695
696
614
    const ColumnString& real_pattern = state->has_custom_escape ? *replaced_patterns : patterns;
697
698
614
    if (vector_search_state == nullptr) {
699
        // pattern type recognition failed, use default case
700
762
        for (int i = 0; i < input_rows_count; ++i) {
701
524
            const auto pattern_val = real_pattern.get_data_at(i);
702
524
            const auto value_val = values.get_data_at(i);
703
524
            RETURN_IF_ERROR((state->scalar_function)(&state->search_state, value_val, pattern_val,
704
524
                                                     &result[i]));
705
524
        }
706
238
        return Status::OK();
707
238
    }
708
376
    const auto* search_strings =
709
376
            static_cast<const ColumnString*>(vector_search_state->_search_strings.get());
710
376
    return (vector_search_state->_vector_function)(values, *search_strings, result);
711
614
}
712
713
Status FunctionLike::like_fn(const LikeSearchState* state, const ColumnString& val,
714
0
                             const StringRef& pattern, ColumnUInt8::Container& result) {
715
0
    std::string re_pattern;
716
0
    convert_like_pattern(state, std::string(pattern.data, pattern.size), &re_pattern);
717
0
    return regexp_fn(state, val, {re_pattern.c_str(), re_pattern.size()}, result);
718
0
}
719
720
Status FunctionLike::like_fn_scalar(const LikeSearchState* state, const StringRef& val,
721
418
                                    const StringRef& pattern, unsigned char* result) {
722
    // Try to use fast path to avoid regex compilation
723
418
    std::string search_string;
724
418
    LikeFastPath fast_path = extract_like_fast_path(pattern.data, pattern.size, search_string);
725
726
418
    switch (fast_path) {
727
12
    case LikeFastPath::ALLPASS:
728
12
        *result = 1;
729
12
        return Status::OK();
730
118
    case LikeFastPath::EQUALS:
731
118
        *result = (val.size == search_string.size() &&
732
118
                   (search_string.empty() ||
733
66
                    memcmp(val.data, search_string.data(), search_string.size()) == 0));
734
118
        return Status::OK();
735
32
    case LikeFastPath::STARTS_WITH:
736
32
        *result = (val.size >= search_string.size() &&
737
32
                   memcmp(val.data, search_string.data(), search_string.size()) == 0);
738
32
        return Status::OK();
739
20
    case LikeFastPath::ENDS_WITH:
740
20
        *result = (val.size >= search_string.size() &&
741
20
                   memcmp(val.data + val.size - search_string.size(), search_string.data(),
742
18
                          search_string.size()) == 0);
743
20
        return Status::OK();
744
56
    case LikeFastPath::SUBSTRING:
745
56
        if (search_string.empty()) {
746
0
            *result = 1;
747
56
        } else {
748
            // Use memmem for substring search
749
56
            *result = (memmem(val.data, val.size, search_string.data(), search_string.size()) !=
750
56
                       nullptr);
751
56
        }
752
56
        return Status::OK();
753
180
    case LikeFastPath::REGEX:
754
180
    default:
755
        // Fall back to regex matching
756
180
        std::string re_pattern;
757
180
        convert_like_pattern(state, std::string(pattern.data, pattern.size), &re_pattern);
758
180
        return regexp_fn_scalar(state, StringRef(val.data, val.size),
759
180
                                {re_pattern.c_str(), re_pattern.size()}, result);
760
418
    }
761
418
}
762
763
void FunctionLike::convert_like_pattern(const LikeSearchState* state, const std::string& pattern,
764
400
                                        std::string* re_pattern) {
765
400
    re_pattern->clear();
766
767
400
    if (pattern.empty()) {
768
0
        re_pattern->append("^$");
769
0
        return;
770
0
    }
771
772
    // add ^ to pattern head to match line head
773
400
    if (!pattern.empty() && pattern[0] != '%') {
774
336
        re_pattern->append("^");
775
336
    }
776
777
    // expect % and _, all chars should keep it literal mean.
778
2.82k
    for (size_t i = 0; i < pattern.size(); i++) {
779
2.42k
        char c = pattern[i];
780
2.42k
        if (c == '\\' && i + 1 < pattern.size()) {
781
76
            char next_c = pattern[i + 1];
782
76
            if (next_c == '%' || next_c == '_') {
783
                // convert "\%" and "\_" to literal "%" and "_"
784
32
                re_pattern->append(1, next_c);
785
32
                i++;
786
32
                continue;
787
44
            } else if (next_c == '\\') {
788
                // keep valid escape "\\"
789
12
                re_pattern->append("\\\\");
790
12
                i++;
791
12
                continue;
792
12
            }
793
76
        }
794
795
2.38k
        if (c == '%') {
796
384
            re_pattern->append(".*");
797
1.99k
        } else if (c == '_') {
798
512
            re_pattern->append(".");
799
1.48k
        } else {
800
            // special for hyperscan: [, ], (, ), {, }, -, *, +, \, |, /, :, ^, ., $, ?
801
1.48k
            if (c == '[' || c == ']' || c == '(' || c == ')' || c == '{' || c == '}' || c == '-' ||
802
1.48k
                c == '*' || c == '+' || c == '\\' || c == '|' || c == '/' || c == ':' || c == '^' ||
803
1.48k
                c == '.' || c == '$' || c == '?') {
804
56
                re_pattern->append(1, '\\');
805
56
            }
806
1.48k
            re_pattern->append(1, c);
807
1.48k
        }
808
2.38k
    }
809
810
    // add $ to pattern tail to match line tail
811
400
    if (!pattern.empty() && re_pattern->back() != '*') {
812
280
        re_pattern->append("$");
813
280
    }
814
400
}
815
816
830
void FunctionLike::remove_escape_character(std::string* search_string) {
817
830
    std::string tmp_search_string;
818
830
    tmp_search_string.swap(*search_string);
819
830
    int64_t len = tmp_search_string.length();
820
    // sometime 'like' may allowed converted to 'equals/start_with/end_with/sub_with'
821
    // so we need to remove escape from pattern to construct search string and use to do 'equals/start_with/end_with/sub_with'
822
3.75k
    for (int i = 0; i < len;) {
823
2.92k
        if (tmp_search_string[i] == '\\' && i + 1 < len &&
824
2.92k
            (tmp_search_string[i + 1] == '%' || tmp_search_string[i + 1] == '_' ||
825
88
             tmp_search_string[i + 1] == '\\')) {
826
72
            search_string->append(1, tmp_search_string[i + 1]);
827
72
            i += 2;
828
2.85k
        } else {
829
2.85k
            search_string->append(1, tmp_search_string[i]);
830
2.85k
            i++;
831
2.85k
        }
832
2.92k
    }
833
830
}
834
835
0
bool re2_full_match(const std::string& str, const RE2& re, std::vector<std::string>& results) {
836
0
    if (!re.ok()) {
837
0
        return false;
838
0
    }
839
840
0
    std::vector<RE2::Arg> arguments;
841
0
    std::vector<RE2::Arg*> arguments_ptrs;
842
0
    std::size_t args_count = re.NumberOfCapturingGroups();
843
0
    arguments.resize(args_count);
844
0
    arguments_ptrs.resize(args_count);
845
0
    results.resize(args_count);
846
0
    for (std::size_t i = 0; i < args_count; ++i) {
847
0
        arguments[i] = &results[i];
848
0
        arguments_ptrs[i] = &arguments[i];
849
0
    }
850
851
0
    return RE2::FullMatchN(str, re, arguments_ptrs.data(), (int)args_count);
852
0
}
853
854
0
void verbose_log_match(const std::string& str, const std::string& pattern_name, const RE2& re) {
855
0
    std::vector<std::string> results;
856
0
    VLOG_DEBUG << "arg str: " << str << ", size: " << str.size() << ", pattern " << pattern_name
857
0
               << ": " << re.pattern() << ", size: " << re.pattern().size();
858
0
    if (re2_full_match(str, re, results)) {
859
0
        for (int i = 0; i < results.size(); ++i) {
860
0
            VLOG_DEBUG << "match " << i << ": " << results[i] << ", size: " << results[i].size();
861
0
        }
862
0
    } else {
863
0
        VLOG_DEBUG << "no match";
864
0
    }
865
0
}
866
867
Status FunctionLike::construct_like_const_state(FunctionContext* context, const StringRef& pattern,
868
                                                std::shared_ptr<LikeState>& state,
869
694
                                                bool try_hyperscan) {
870
694
    std::string pattern_str;
871
694
    if (state->has_custom_escape) {
872
2
        pattern_str = replace_pattern_by_escape(pattern, state->escape_char);
873
692
    } else {
874
692
        pattern_str = pattern.to_string();
875
692
    }
876
694
    state->search_state.pattern_str = pattern_str;
877
694
    std::string search_string;
878
879
694
    if (!pattern_str.empty() && RE2::FullMatch(pattern_str, LIKE_ALLPASS_RE)) {
880
44
        state->search_state.set_search_string("");
881
44
        state->function = constant_allpass_fn;
882
44
        state->scalar_function = constant_allpass_fn_scalar;
883
650
    } else if (pattern_str.empty() || RE2::FullMatch(pattern_str, LIKE_EQUALS_RE, &search_string)) {
884
190
        if (VLOG_DEBUG_IS_ON) {
885
0
            verbose_log_match(pattern_str, "LIKE_EQUALS_RE", LIKE_EQUALS_RE);
886
0
            VLOG_DEBUG << "search_string : " << search_string << ", size: " << search_string.size();
887
0
        }
888
190
        remove_escape_character(&search_string);
889
190
        if (VLOG_DEBUG_IS_ON) {
890
0
            VLOG_DEBUG << "search_string escape removed: " << search_string
891
0
                       << ", size: " << search_string.size();
892
0
        }
893
190
        state->search_state.set_search_string(search_string);
894
190
        state->function = constant_equals_fn;
895
190
        state->scalar_function = constant_equals_fn_scalar;
896
460
    } else if (RE2::FullMatch(pattern_str, LIKE_STARTS_WITH_RE, &search_string)) {
897
64
        if (VLOG_DEBUG_IS_ON) {
898
0
            verbose_log_match(pattern_str, "LIKE_STARTS_WITH_RE", LIKE_STARTS_WITH_RE);
899
0
            VLOG_DEBUG << "search_string : " << search_string << ", size: " << search_string.size();
900
0
        }
901
64
        remove_escape_character(&search_string);
902
64
        if (VLOG_DEBUG_IS_ON) {
903
0
            VLOG_DEBUG << "search_string escape removed: " << search_string
904
0
                       << ", size: " << search_string.size();
905
0
        }
906
64
        state->search_state.set_search_string(search_string);
907
64
        state->function = constant_starts_with_fn;
908
64
        state->scalar_function = constant_starts_with_fn_scalar;
909
396
    } else if (RE2::FullMatch(pattern_str, LIKE_ENDS_WITH_RE, &search_string)) {
910
64
        if (VLOG_DEBUG_IS_ON) {
911
0
            verbose_log_match(pattern_str, "LIKE_ENDS_WITH_RE", LIKE_ENDS_WITH_RE);
912
0
            VLOG_DEBUG << "search_string : " << search_string << ", size: " << search_string.size();
913
0
        }
914
64
        remove_escape_character(&search_string);
915
64
        if (VLOG_DEBUG_IS_ON) {
916
0
            VLOG_DEBUG << "search_string escape removed: " << search_string
917
0
                       << ", size: " << search_string.size();
918
0
        }
919
64
        state->search_state.set_search_string(search_string);
920
64
        state->function = constant_ends_with_fn;
921
64
        state->scalar_function = constant_ends_with_fn_scalar;
922
332
    } else if (RE2::FullMatch(pattern_str, LIKE_SUBSTRING_RE, &search_string)) {
923
112
        if (VLOG_DEBUG_IS_ON) {
924
0
            verbose_log_match(pattern_str, "LIKE_SUBSTRING_RE", LIKE_SUBSTRING_RE);
925
0
            VLOG_DEBUG << "search_string : " << search_string << ", size: " << search_string.size();
926
0
        }
927
112
        remove_escape_character(&search_string);
928
112
        if (VLOG_DEBUG_IS_ON) {
929
0
            VLOG_DEBUG << "search_string escape removed: " << search_string
930
0
                       << ", size: " << search_string.size();
931
0
        }
932
112
        state->search_state.set_search_string(search_string);
933
112
        state->function = constant_substring_fn;
934
112
        state->scalar_function = constant_substring_fn_scalar;
935
220
    } else {
936
220
        std::string re_pattern;
937
220
        convert_like_pattern(&state->search_state, pattern_str, &re_pattern);
938
220
        if (VLOG_DEBUG_IS_ON) {
939
0
            VLOG_DEBUG << "hyperscan, pattern str: " << pattern_str
940
0
                       << ", size: " << pattern_str.size() << ", re pattern: " << re_pattern
941
0
                       << ", size: " << re_pattern.size();
942
0
        }
943
944
220
        hs_database_t* database = nullptr;
945
220
        hs_scratch_t* scratch = nullptr;
946
220
        if (try_hyperscan && hs_prepare(context, re_pattern.c_str(), &database, &scratch).ok()) {
947
            // use hyperscan
948
220
            state->search_state.hs_database.reset(database);
949
220
            state->search_state.hs_scratch.reset(scratch);
950
220
        } else {
951
            // fallback to re2
952
            // reset hs_database to nullptr to indicate not use hyperscan
953
0
            state->search_state.hs_database.reset();
954
0
            state->search_state.hs_scratch.reset();
955
956
0
            RE2::Options opts;
957
0
            opts.set_never_nl(false);
958
0
            opts.set_dot_nl(true);
959
0
            state->search_state.regex = std::make_unique<RE2>(re_pattern, opts);
960
0
            if (!state->search_state.regex->ok()) {
961
0
                return Status::InternalError("Invalid regex expression: {}(origin: {})", re_pattern,
962
0
                                             pattern_str);
963
0
            }
964
0
        }
965
966
220
        state->function = constant_regex_fn;
967
220
        state->scalar_function = constant_regex_fn_scalar;
968
220
    }
969
694
    return Status::OK();
970
694
}
971
972
2.14k
Status FunctionLike::open(FunctionContext* context, FunctionContext::FunctionStateScope scope) {
973
2.14k
    if (scope != FunctionContext::THREAD_LOCAL) {
974
1.07k
        return Status::OK();
975
1.07k
    }
976
1.07k
    std::shared_ptr<LikeState> state = std::make_shared<LikeState>();
977
1.07k
    state->is_like_pattern = true;
978
1.07k
    state->function = like_fn;
979
1.07k
    state->scalar_function = like_fn_scalar;
980
1.07k
    if (context->is_col_constant(2)) {
981
2
        state->has_custom_escape = true;
982
2
        const auto escape_col = context->get_constant_col(2)->column_ptr;
983
2
        const auto& escape = escape_col->get_data_at(0);
984
2
        if (escape.size != 1) {
985
0
            return Status::InternalError("Escape character must be a single character, got: {}",
986
0
                                         escape.to_string());
987
0
        }
988
2
        state->escape_char = escape.data[0];
989
2
    }
990
1.07k
    if (context->is_col_constant(1)) {
991
694
        const auto pattern_col = context->get_constant_col(1)->column_ptr;
992
694
        const auto& pattern = pattern_col->get_data_at(0);
993
694
        RETURN_IF_ERROR(construct_like_const_state(context, pattern, state));
994
694
    }
995
1.07k
    context->set_function_state(scope, state);
996
997
1.07k
    return Status::OK();
998
1.07k
}
999
1000
Status FunctionRegexpLike::open(FunctionContext* context,
1001
244
                                FunctionContext::FunctionStateScope scope) {
1002
244
    if (scope != FunctionContext::THREAD_LOCAL) {
1003
122
        return Status::OK();
1004
122
    }
1005
122
    std::shared_ptr<LikeState> state = std::make_shared<LikeState>();
1006
122
    context->set_function_state(scope, state);
1007
122
    state->is_like_pattern = false;
1008
122
    state->function = regexp_fn;
1009
122
    state->scalar_function = regexp_fn_scalar;
1010
122
    if (context->is_col_constant(1)) {
1011
122
        const auto pattern_col = context->get_constant_col(1)->column_ptr;
1012
122
        const auto& pattern = pattern_col->get_data_at(0);
1013
1014
122
        std::string pattern_str = pattern.to_string();
1015
122
        std::string search_string;
1016
122
        if (RE2::FullMatch(pattern_str, ALLPASS_RE)) {
1017
8
            state->search_state.set_search_string("");
1018
8
            state->function = constant_allpass_fn;
1019
8
            state->scalar_function = constant_allpass_fn_scalar;
1020
114
        } else if (RE2::FullMatch(pattern_str, EQUALS_RE, &search_string)) {
1021
14
            state->search_state.set_search_string(search_string);
1022
14
            state->function = constant_equals_fn;
1023
14
            state->scalar_function = constant_equals_fn_scalar;
1024
100
        } else if (RE2::FullMatch(pattern_str, STARTS_WITH_RE, &search_string)) {
1025
14
            state->search_state.set_search_string(search_string);
1026
14
            state->function = constant_starts_with_fn;
1027
14
            state->scalar_function = constant_starts_with_fn_scalar;
1028
86
        } else if (RE2::FullMatch(pattern_str, ENDS_WITH_RE, &search_string)) {
1029
14
            state->search_state.set_search_string(search_string);
1030
14
            state->function = constant_ends_with_fn;
1031
14
            state->scalar_function = constant_ends_with_fn_scalar;
1032
72
        } else if (RE2::FullMatch(pattern_str, SUBSTRING_RE, &search_string)) {
1033
28
            state->search_state.set_search_string(search_string);
1034
28
            state->function = constant_substring_fn;
1035
28
            state->scalar_function = constant_substring_fn_scalar;
1036
44
        } else {
1037
44
            hs_database_t* database = nullptr;
1038
44
            hs_scratch_t* scratch = nullptr;
1039
44
            if (hs_prepare(context, pattern_str.c_str(), &database, &scratch).ok()) {
1040
                // use hyperscan
1041
44
                state->search_state.hs_database.reset(database);
1042
44
                state->search_state.hs_scratch.reset(scratch);
1043
44
            } else {
1044
                // fallback to re2
1045
                // reset hs_database to nullptr to indicate not use hyperscan
1046
0
                state->search_state.hs_database.reset();
1047
0
                state->search_state.hs_scratch.reset();
1048
0
                RE2::Options opts;
1049
0
                opts.set_never_nl(false);
1050
0
                opts.set_dot_nl(true);
1051
0
                state->search_state.regex = std::make_unique<RE2>(pattern_str, opts);
1052
0
                if (!state->search_state.regex->ok()) {
1053
0
                    if (!context->state()->enable_extended_regex()) {
1054
0
                        return Status::InternalError(
1055
0
                                "Invalid regex expression: {}. Error: {}. If you need advanced "
1056
0
                                "regex features, try setting enable_extended_regex=true",
1057
0
                                pattern_str, state->search_state.regex->error());
1058
0
                    }
1059
1060
                    // RE2 failed, fallback to Boost.Regex
1061
                    // This handles advanced regex features like zero-width assertions
1062
0
                    state->search_state.regex.reset();
1063
0
                    try {
1064
0
                        state->search_state.boost_regex =
1065
0
                                std::make_unique<boost::regex>(pattern_str);
1066
0
                    } catch (const boost::regex_error& e) {
1067
0
                        return Status::InternalError("Invalid regex expression: {}. Error: {}",
1068
0
                                                     pattern_str, e.what());
1069
0
                    }
1070
0
                }
1071
0
            }
1072
44
            state->function = constant_regex_fn;
1073
44
            state->scalar_function = constant_regex_fn_scalar;
1074
44
        }
1075
122
    }
1076
122
    return Status::OK();
1077
122
}
1078
1079
2
void register_function_like(SimpleFunctionFactory& factory) {
1080
2
    factory.register_function<FunctionLike>();
1081
2
}
1082
1083
2
void register_function_regexp(SimpleFunctionFactory& factory) {
1084
2
    factory.register_function<FunctionRegexpLike>();
1085
2
    factory.register_alias(FunctionRegexpLike::name, FunctionRegexpLike::alias);
1086
2
}
1087
#include "common/compile_check_end.h"
1088
} // namespace doris