Coverage Report

Created: 2026-04-20 10:23

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exprs/function/like.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "exprs/function/like.h"
19
20
#include <fmt/format.h>
21
#include <hs/hs_compile.h>
22
#include <re2/stringpiece.h>
23
24
#include <cstddef>
25
#include <ostream>
26
#include <utility>
27
#include <vector>
28
29
#include "common/logging.h"
30
#include "core/block/block.h"
31
#include "core/block/column_with_type_and_name.h"
32
#include "core/column/column.h"
33
#include "core/column/column_const.h"
34
#include "core/column/column_vector.h"
35
#include "core/string_ref.h"
36
#include "exprs/function/simple_function_factory.h"
37
38
namespace doris {
39
// A regex to match any regex pattern is equivalent to a substring search.
40
static const RE2 SUBSTRING_RE(R"((?:\.\*)*([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)(?:\.\*)*)");
41
42
// A regex to match any regex pattern which is equivalent to matching a constant string
43
// at the end of the string values.
44
static const RE2 ENDS_WITH_RE(R"((?:\.\*)*([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)\$)");
45
46
// A regex to match any regex pattern which is equivalent to matching a constant string
47
// at the end of the string values.
48
static const RE2 STARTS_WITH_RE(R"(\^([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)(?:\.\*)*)");
49
50
// A regex to match any regex pattern which is equivalent to a constant string match.
51
static const RE2 EQUALS_RE(R"(\^([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)\$)");
52
// A regex to match .*
53
static const RE2 ALLPASS_RE(R"((\.\*)+)");
54
55
// Like patterns
56
static const re2::RE2 LIKE_SUBSTRING_RE(R"((?:%+)(((\\_)|([^%_\\]))+)(?:%+))");
57
static const re2::RE2 LIKE_ENDS_WITH_RE("(?:%+)(((\\\\_)|([^%_]))+)");
58
static const re2::RE2 LIKE_STARTS_WITH_RE(R"((((\\%)|(\\_)|([^%_\\]))+)(?:%+))");
59
static const re2::RE2 LIKE_EQUALS_RE("(((\\\\_)|([^%_]))+)");
60
static const re2::RE2 LIKE_ALLPASS_RE("%+");
61
62
struct VectorAllpassSearchState : public VectorPatternSearchState {
63
424
    VectorAllpassSearchState() : VectorPatternSearchState(FunctionLikeBase::vector_allpass_fn) {}
64
65
    ~VectorAllpassSearchState() override = default;
66
67
397
    void like_pattern_match(const std::string& pattern_str) override {
68
397
        if (!pattern_str.empty() && RE2::FullMatch(pattern_str, LIKE_ALLPASS_RE)) {
69
26
            _search_strings->insert_default();
70
371
        } else {
71
371
            _pattern_matched = false;
72
371
        }
73
397
    }
74
75
32
    void regexp_pattern_match(const std::string& pattern_str) override {
76
32
        if (RE2::FullMatch(pattern_str, ALLPASS_RE)) {
77
0
            _search_strings->insert_default();
78
32
        } else {
79
32
            _pattern_matched = false;
80
32
        }
81
32
    }
82
};
83
84
struct VectorEqualSearchState : public VectorPatternSearchState {
85
424
    VectorEqualSearchState() : VectorPatternSearchState(FunctionLikeBase::vector_equals_fn) {}
86
87
    ~VectorEqualSearchState() override = default;
88
89
423
    void like_pattern_match(const std::string& pattern_str) override {
90
423
        _search_string.clear();
91
423
        if (pattern_str.empty() || RE2::FullMatch(pattern_str, LIKE_EQUALS_RE, &_search_string)) {
92
119
            FunctionLike::remove_escape_character(&_search_string);
93
119
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
94
304
        } else {
95
304
            _pattern_matched = false;
96
304
        }
97
423
    }
98
99
32
    void regexp_pattern_match(const std::string& pattern_str) override {
100
32
        _search_string.clear();
101
32
        if (RE2::FullMatch(pattern_str, EQUALS_RE, &_search_string)) {
102
0
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
103
32
        } else {
104
32
            _pattern_matched = false;
105
32
        }
106
32
    }
107
};
108
109
struct VectorSubStringSearchState : public VectorPatternSearchState {
110
    VectorSubStringSearchState()
111
423
            : VectorPatternSearchState(FunctionLikeBase::vector_substring_fn) {}
112
113
    ~VectorSubStringSearchState() override = default;
114
115
415
    void like_pattern_match(const std::string& pattern_str) override {
116
415
        _search_string.clear();
117
415
        if (RE2::FullMatch(pattern_str, LIKE_SUBSTRING_RE, &_search_string)) {
118
53
            FunctionLike::remove_escape_character(&_search_string);
119
53
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
120
362
        } else {
121
362
            _pattern_matched = false;
122
362
        }
123
415
    }
124
125
55
    void regexp_pattern_match(const std::string& pattern_str) override {
126
55
        _search_string.clear();
127
55
        if (RE2::FullMatch(pattern_str, SUBSTRING_RE, &_search_string)) {
128
34
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
129
34
        } else {
130
21
            _pattern_matched = false;
131
21
        }
132
55
    }
133
};
134
135
struct VectorStartsWithSearchState : public VectorPatternSearchState {
136
    VectorStartsWithSearchState()
137
424
            : VectorPatternSearchState(FunctionLikeBase::vector_starts_with_fn) {}
138
139
    ~VectorStartsWithSearchState() override = default;
140
141
404
    void like_pattern_match(const std::string& pattern_str) override {
142
404
        _search_string.clear();
143
404
        if (RE2::FullMatch(pattern_str, LIKE_STARTS_WITH_RE, &_search_string)) {
144
75
            FunctionLike::remove_escape_character(&_search_string);
145
75
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
146
329
        } else {
147
329
            _pattern_matched = false;
148
329
        }
149
404
    }
150
151
50
    void regexp_pattern_match(const std::string& pattern_str) override {
152
50
        _search_string.clear();
153
50
        if (RE2::FullMatch(pattern_str, STARTS_WITH_RE, &_search_string)) {
154
23
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
155
27
        } else {
156
27
            _pattern_matched = false;
157
27
        }
158
50
    }
159
};
160
161
struct VectorEndsWithSearchState : public VectorPatternSearchState {
162
423
    VectorEndsWithSearchState() : VectorPatternSearchState(FunctionLikeBase::vector_ends_with_fn) {}
163
164
    ~VectorEndsWithSearchState() override = default;
165
166
403
    void like_pattern_match(const std::string& pattern_str) override {
167
403
        _search_string.clear();
168
403
        if (RE2::FullMatch(pattern_str, LIKE_ENDS_WITH_RE, &_search_string)) {
169
42
            FunctionLike::remove_escape_character(&_search_string);
170
42
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
171
361
        } else {
172
361
            _pattern_matched = false;
173
361
        }
174
403
    }
175
176
38
    void regexp_pattern_match(const std::string& pattern_str) override {
177
38
        _search_string.clear();
178
38
        if (RE2::FullMatch(pattern_str, ENDS_WITH_RE, &_search_string)) {
179
9
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
180
29
        } else {
181
29
            _pattern_matched = false;
182
29
        }
183
38
    }
184
};
185
186
341
Status LikeSearchState::clone(LikeSearchState& cloned) {
187
341
    cloned.set_search_string(search_string);
188
189
341
    std::string re_pattern;
190
341
    FunctionLike::convert_like_pattern(this, pattern_str, &re_pattern);
191
341
    if (hs_database) { // use hyperscan
192
176
        hs_database_t* database = nullptr;
193
176
        hs_scratch_t* scratch = nullptr;
194
176
        RETURN_IF_ERROR(FunctionLike::hs_prepare(nullptr, re_pattern.c_str(), &database, &scratch));
195
196
176
        cloned.hs_database.reset(database);
197
176
        cloned.hs_scratch.reset(scratch);
198
176
    } else { // fallback to re2
199
165
        cloned.hs_database.reset();
200
165
        cloned.hs_scratch.reset();
201
202
165
        RE2::Options opts;
203
165
        opts.set_never_nl(false);
204
165
        opts.set_dot_nl(true);
205
165
        cloned.regex = std::make_unique<RE2>(re_pattern, opts);
206
165
        if (!cloned.regex->ok()) {
207
0
            return Status::InternalError("Invalid regex expression: {}", re_pattern);
208
0
        }
209
165
    }
210
211
341
    return Status::OK();
212
341
}
213
214
Status FunctionLikeBase::constant_allpass_fn(const LikeSearchState* state, const ColumnString& vals,
215
                                             const StringRef& pattern,
216
36
                                             ColumnUInt8::Container& result) {
217
36
    memset(result.data(), 1, vals.size());
218
36
    return Status::OK();
219
36
}
220
221
Status FunctionLikeBase::constant_allpass_fn_scalar(const LikeSearchState* state,
222
                                                    const StringRef& val, const StringRef& pattern,
223
45
                                                    unsigned char* result) {
224
45
    *result = 1;
225
45
    return Status::OK();
226
45
}
227
228
Status FunctionLikeBase::vector_allpass_fn(const ColumnString& vals,
229
                                           const ColumnString& search_strings,
230
21
                                           ColumnUInt8::Container& result) {
231
21
    DCHECK(vals.size() == search_strings.size());
232
21
    DCHECK(vals.size() == result.size());
233
21
    memset(result.data(), 1, vals.size());
234
21
    return Status::OK();
235
21
}
236
237
Status FunctionLikeBase::constant_starts_with_fn(const LikeSearchState* state,
238
                                                 const ColumnString& val, const StringRef& pattern,
239
938
                                                 ColumnUInt8::Container& result) {
240
938
    auto sz = val.size();
241
147k
    for (size_t i = 0; i < sz; i++) {
242
146k
        const auto& str_ref = val.get_data_at(i);
243
146k
        result[i] = (str_ref.size >= state->search_string_sv.size) &&
244
147k
                    str_ref.start_with(state->search_string_sv);
245
146k
    }
246
938
    return Status::OK();
247
938
}
248
249
Status FunctionLikeBase::constant_starts_with_fn_scalar(const LikeSearchState* state,
250
                                                        const StringRef& val,
251
                                                        const StringRef& pattern,
252
94
                                                        unsigned char* result) {
253
94
    *result = (val.size >= state->search_string_sv.size) &&
254
94
              (state->search_string_sv == val.substring(0, state->search_string_sv.size));
255
94
    return Status::OK();
256
94
}
257
258
Status FunctionLikeBase::vector_starts_with_fn(const ColumnString& vals,
259
                                               const ColumnString& search_strings,
260
68
                                               ColumnUInt8::Container& result) {
261
68
    DCHECK(vals.size() == search_strings.size());
262
68
    DCHECK(vals.size() == result.size());
263
68
    auto sz = vals.size();
264
158
    for (size_t i = 0; i < sz; ++i) {
265
90
        const auto& str_sv = vals.get_data_at(i);
266
90
        const auto& search_string_sv = search_strings.get_data_at(i);
267
90
        result[i] = (str_sv.size >= search_string_sv.size) && str_sv.start_with(search_string_sv);
268
90
    }
269
68
    return Status::OK();
270
68
}
271
272
Status FunctionLikeBase::constant_ends_with_fn(const LikeSearchState* state,
273
                                               const ColumnString& val, const StringRef& pattern,
274
103
                                               ColumnUInt8::Container& result) {
275
103
    auto sz = val.size();
276
5.88k
    for (size_t i = 0; i < sz; i++) {
277
5.78k
        const auto& str_ref = val.get_data_at(i);
278
5.78k
        result[i] = (str_ref.size >= state->search_string_sv.size) &&
279
5.78k
                    str_ref.end_with(state->search_string_sv);
280
5.78k
    }
281
103
    return Status::OK();
282
103
}
283
284
Status FunctionLikeBase::constant_ends_with_fn_scalar(const LikeSearchState* state,
285
                                                      const StringRef& val,
286
                                                      const StringRef& pattern,
287
918
                                                      unsigned char* result) {
288
918
    *result = (val.size >= state->search_string_sv.size) &&
289
918
              (state->search_string_sv == val.substring(val.size - state->search_string_sv.size,
290
908
                                                        state->search_string_sv.size));
291
918
    return Status::OK();
292
918
}
293
294
Status FunctionLikeBase::vector_ends_with_fn(const ColumnString& vals,
295
                                             const ColumnString& search_strings,
296
34
                                             ColumnUInt8::Container& result) {
297
34
    DCHECK(vals.size() == search_strings.size());
298
34
    DCHECK(vals.size() == result.size());
299
34
    auto sz = vals.size();
300
78
    for (size_t i = 0; i < sz; ++i) {
301
44
        const auto& str_sv = vals.get_data_at(i);
302
44
        const auto& search_string_sv = search_strings.get_data_at(i);
303
44
        result[i] = (str_sv.size >= search_string_sv.size) && str_sv.end_with(search_string_sv);
304
44
    }
305
34
    return Status::OK();
306
34
}
307
308
Status FunctionLikeBase::constant_equals_fn(const LikeSearchState* state, const ColumnString& val,
309
                                            const StringRef& pattern,
310
50
                                            ColumnUInt8::Container& result) {
311
50
    auto sz = val.size();
312
127
    for (size_t i = 0; i < sz; i++) {
313
77
        result[i] = (val.get_data_at(i) == state->search_string_sv);
314
77
    }
315
50
    return Status::OK();
316
50
}
317
318
Status FunctionLikeBase::constant_equals_fn_scalar(const LikeSearchState* state,
319
                                                   const StringRef& val, const StringRef& pattern,
320
149
                                                   unsigned char* result) {
321
149
    *result = (val == state->search_string_sv);
322
149
    return Status::OK();
323
149
}
324
325
Status FunctionLikeBase::vector_equals_fn(const ColumnString& vals,
326
                                          const ColumnString& search_strings,
327
88
                                          ColumnUInt8::Container& result) {
328
88
    DCHECK(vals.size() == search_strings.size());
329
88
    DCHECK(vals.size() == result.size());
330
88
    auto sz = vals.size();
331
197
    for (size_t i = 0; i < sz; ++i) {
332
109
        const auto& str_sv = vals.get_data_at(i);
333
109
        const auto& search_string_sv = search_strings.get_data_at(i);
334
109
        result[i] = str_sv == search_string_sv;
335
109
    }
336
88
    return Status::OK();
337
88
}
338
339
Status FunctionLikeBase::constant_substring_fn(const LikeSearchState* state,
340
                                               const ColumnString& val, const StringRef& pattern,
341
0
                                               ColumnUInt8::Container& result) {
342
0
    auto sz = val.size();
343
0
    for (size_t i = 0; i < sz; i++) {
344
0
        if (state->search_string_sv.size == 0) {
345
0
            result[i] = true;
346
0
            continue;
347
0
        }
348
0
        result[i] = state->substring_pattern.search(val.get_data_at(i)) != -1;
349
0
    }
350
0
    return Status::OK();
351
0
}
352
353
Status FunctionLikeBase::constant_substring_fn_scalar(const LikeSearchState* state,
354
                                                      const StringRef& val,
355
                                                      const StringRef& pattern,
356
61.2k
                                                      unsigned char* result) {
357
61.2k
    if (state->search_string_sv.size == 0) {
358
0
        *result = true;
359
0
        return Status::OK();
360
0
    }
361
61.2k
    *result = state->substring_pattern.search(val) != -1;
362
61.2k
    return Status::OK();
363
61.2k
}
364
365
Status FunctionLikeBase::vector_substring_fn(const ColumnString& vals,
366
                                             const ColumnString& search_strings,
367
41
                                             ColumnUInt8::Container& result) {
368
41
    DCHECK(vals.size() == search_strings.size());
369
41
    DCHECK(vals.size() == result.size());
370
41
    auto sz = vals.size();
371
107
    for (size_t i = 0; i < sz; ++i) {
372
66
        const auto& str_sv = vals.get_data_at(i);
373
66
        const auto& search_string_sv = search_strings.get_data_at(i);
374
66
        if (search_string_sv.size == 0) {
375
2
            result[i] = true;
376
2
            continue;
377
2
        }
378
64
        doris::StringSearch substring_search(&search_string_sv);
379
64
        result[i] = substring_search.search(str_sv) != -1;
380
64
    }
381
41
    return Status::OK();
382
41
}
383
384
Status FunctionLikeBase::constant_regex_fn_scalar(const LikeSearchState* state,
385
                                                  const StringRef& val, const StringRef& pattern,
386
4.08k
                                                  unsigned char* result) {
387
4.08k
    if (state->hs_database) { // use hyperscan
388
3.87k
        auto ret = hs_scan(state->hs_database.get(), val.data, (int)val.size, 0,
389
3.87k
                           state->hs_scratch.get(), doris::LikeSearchState::hs_match_handler,
390
3.87k
                           (void*)result);
391
3.87k
        if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
392
0
            return Status::RuntimeError(fmt::format("hyperscan error: {}", ret));
393
0
        }
394
3.87k
    } else if (state->boost_regex) { // use boost::regex for advanced features
395
4
        *result = boost::regex_search(val.data, val.data + val.size, *state->boost_regex);
396
200
    } else { // fallback to re2
397
200
        *result = RE2::PartialMatch(re2::StringPiece(val.data, val.size), *state->regex);
398
200
    }
399
400
4.08k
    return Status::OK();
401
4.08k
}
402
403
Status FunctionLikeBase::regexp_fn_scalar(const LikeSearchState* state, const StringRef& val,
404
104
                                          const StringRef& pattern, unsigned char* result) {
405
104
    RE2::Options opts;
406
104
    opts.set_never_nl(false);
407
104
    opts.set_dot_nl(true);
408
104
    re2::RE2 re(re2::StringPiece(pattern.data, pattern.size), opts);
409
104
    if (re.ok()) {
410
104
        *result = RE2::PartialMatch(re2::StringPiece(val.data, val.size), re);
411
104
    } else {
412
0
        return Status::RuntimeError("Invalid pattern: {}", pattern.debug_string());
413
0
    }
414
415
104
    return Status::OK();
416
104
}
417
418
Status FunctionLikeBase::constant_regex_fn(const LikeSearchState* state, const ColumnString& val,
419
                                           const StringRef& pattern,
420
769
                                           ColumnUInt8::Container& result) {
421
769
    auto sz = val.size();
422
769
    if (state->hs_database) { // use hyperscan
423
741k
        for (size_t i = 0; i < sz; i++) {
424
740k
            const auto& str_ref = val.get_data_at(i);
425
740k
            auto ret = hs_scan(state->hs_database.get(), str_ref.data, (int)str_ref.size, 0,
426
740k
                               state->hs_scratch.get(), doris::LikeSearchState::hs_match_handler,
427
740k
                               (void*)(result.data() + i));
428
740k
            if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
429
0
                return Status::RuntimeError(fmt::format("hyperscan error: {}", ret));
430
0
            }
431
740k
        }
432
766
    } else if (state->boost_regex) { // use boost::regex for advanced features
433
0
        for (size_t i = 0; i < sz; i++) {
434
0
            const auto& str_ref = val.get_data_at(i);
435
0
            *(result.data() + i) = boost::regex_search(str_ref.data, str_ref.data + str_ref.size,
436
0
                                                       *state->boost_regex);
437
0
        }
438
3
    } else { // fallback to re2
439
7
        for (size_t i = 0; i < sz; i++) {
440
4
            const auto& str_ref = val.get_data_at(i);
441
4
            *(result.data() + i) =
442
4
                    RE2::PartialMatch(re2::StringPiece(str_ref.data, str_ref.size), *state->regex);
443
4
        }
444
3
    }
445
446
769
    return Status::OK();
447
769
}
448
449
Status FunctionLikeBase::regexp_fn(const LikeSearchState* state, const ColumnString& val,
450
0
                                   const StringRef& pattern, ColumnUInt8::Container& result) {
451
0
    std::string re_pattern(pattern.data, pattern.size);
452
453
0
    hs_database_t* database = nullptr;
454
0
    hs_scratch_t* scratch = nullptr;
455
0
    if (hs_prepare(nullptr, re_pattern.c_str(), &database, &scratch).ok()) { // use hyperscan
456
0
        auto sz = val.size();
457
0
        for (size_t i = 0; i < sz; i++) {
458
0
            const auto& str_ref = val.get_data_at(i);
459
0
            auto ret =
460
0
                    hs_scan(database, str_ref.data, (int)str_ref.size, 0, scratch,
461
0
                            doris::LikeSearchState::hs_match_handler, (void*)(result.data() + i));
462
0
            if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
463
0
                return Status::RuntimeError(fmt::format("hyperscan error: {}", ret));
464
0
            }
465
0
        }
466
467
0
        hs_free_scratch(scratch);
468
0
        hs_free_database(database);
469
0
    } else { // fallback to re2
470
0
        RE2::Options opts;
471
0
        opts.set_never_nl(false);
472
0
        opts.set_dot_nl(true);
473
0
        re2::RE2 re(re_pattern, opts);
474
0
        if (re.ok()) {
475
0
            auto sz = val.size();
476
0
            for (size_t i = 0; i < sz; i++) {
477
0
                const auto& str_ref = val.get_data_at(i);
478
0
                *(result.data() + i) =
479
0
                        RE2::PartialMatch(re2::StringPiece(str_ref.data, str_ref.size), re);
480
0
            }
481
0
        } else {
482
0
            return Status::RuntimeError("Invalid pattern: {}", pattern.debug_string());
483
0
        }
484
0
    }
485
486
0
    return Status::OK();
487
0
}
488
489
// hyperscan compile expression to database and allocate scratch space
490
Status FunctionLikeBase::hs_prepare(FunctionContext* context, const char* expression,
491
1.42k
                                    hs_database_t** database, hs_scratch_t** scratch) {
492
1.42k
    hs_compile_error_t* compile_err;
493
1.42k
    auto res = hs_compile(expression, HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8,
494
1.42k
                          HS_MODE_BLOCK, nullptr, database, &compile_err);
495
496
1.42k
    if (res != HS_SUCCESS) {
497
19
        *database = nullptr;
498
19
        std::string error_message = compile_err->message;
499
19
        hs_free_compile_error(compile_err);
500
        // Do not call FunctionContext::set_error here, since we do not want to cancel the query here.
501
19
        return Status::RuntimeError<false>("hs_compile regex pattern error:" + error_message);
502
19
    }
503
1.40k
    hs_free_compile_error(compile_err);
504
505
1.40k
    if (hs_alloc_scratch(*database, scratch) != HS_SUCCESS) {
506
0
        hs_free_database(*database);
507
0
        *database = nullptr;
508
0
        *scratch = nullptr;
509
        // Do not call FunctionContext::set_error here, since we do not want to cancel the query here.
510
0
        return Status::RuntimeError<false>("hs_alloc_scratch allocate scratch space error");
511
0
    }
512
513
1.40k
    return Status::OK();
514
1.40k
}
515
516
Status FunctionLikeBase::execute_impl(FunctionContext* context, Block& block,
517
                                      const ColumnNumbers& arguments, uint32_t result,
518
3.82k
                                      size_t input_rows_count) const {
519
3.82k
    const auto values_col =
520
3.82k
            block.get_by_position(arguments[0]).column->convert_to_full_column_if_const();
521
3.82k
    const auto* values = check_and_get_column<ColumnString>(values_col.get());
522
523
3.82k
    if (!values) {
524
0
        return Status::InternalError("Not supported input arguments types");
525
0
    }
526
    // result column
527
3.82k
    auto res = ColumnUInt8::create();
528
3.82k
    ColumnUInt8::Container& vec_res = res->get_data();
529
    // set default value to 0, and match functions only need to set 1/true
530
3.82k
    vec_res.resize_fill(input_rows_count);
531
3.82k
    auto* state = reinterpret_cast<LikeState*>(
532
3.82k
            context->get_function_state(FunctionContext::THREAD_LOCAL));
533
    // for constant_substring_fn, use long run length search for performance
534
3.82k
    if (constant_substring_fn ==
535
3.82k
        *(state->function
536
3.82k
                  .target<doris::Status (*)(const LikeSearchState* state, const ColumnString&,
537
3.82k
                                            const StringRef&, ColumnUInt8::Container&)>())) {
538
1.50k
        RETURN_IF_ERROR(execute_substring(values->get_chars(), values->get_offsets(), vec_res,
539
1.50k
                                          &state->search_state));
540
2.31k
    } else {
541
2.31k
        const auto pattern_col = block.get_by_position(arguments[1]).column;
542
2.31k
        if (const auto* str_patterns = check_and_get_column<ColumnString>(pattern_col.get())) {
543
424
            RETURN_IF_ERROR(
544
424
                    vector_non_const(*values, *str_patterns, vec_res, state, input_rows_count));
545
1.89k
        } else if (const auto* const_patterns =
546
1.89k
                           check_and_get_column<ColumnConst>(pattern_col.get())) {
547
1.89k
            const auto& pattern_val = const_patterns->get_data_at(0);
548
1.89k
            RETURN_IF_ERROR(vector_const(*values, &pattern_val, vec_res, state->function,
549
1.89k
                                         &state->search_state));
550
18.4E
        } else {
551
18.4E
            return Status::InternalError("Not supported input arguments types");
552
18.4E
        }
553
2.31k
    }
554
3.82k
    block.replace_by_position(result, std::move(res));
555
3.82k
    return Status::OK();
556
3.82k
}
557
558
Status FunctionLikeBase::execute_substring(const ColumnString::Chars& values,
559
                                           const ColumnString::Offsets& value_offsets,
560
                                           ColumnUInt8::Container& result,
561
1.50k
                                           LikeSearchState* search_state) const {
562
    // treat continuous multi string data as a long string data
563
1.50k
    const UInt8* begin = values.data();
564
1.50k
    const UInt8* end = begin + values.size();
565
1.50k
    const UInt8* pos = begin;
566
567
    /// Current index in the array of strings.
568
1.50k
    size_t i = 0;
569
1.50k
    size_t needle_size = search_state->substring_pattern.get_pattern_length();
570
571
    /// We will search for the next occurrence in all strings at once.
572
205k
    while (pos < end) {
573
        // search return matched substring start offset
574
205k
        pos = (UInt8*)search_state->substring_pattern.search((char*)pos, end - pos);
575
205k
        if (pos >= end) {
576
1.11k
            break;
577
1.11k
        }
578
579
        /// Determine which index it refers to.
580
        /// begin + value_offsets[i] is the start offset of string at i+1
581
1.35M
        while (i < value_offsets.size() && begin + value_offsets[i] < pos) {
582
1.14M
            ++i;
583
1.14M
        }
584
585
        /// We check that the entry does not pass through the boundaries of strings.
586
203k
        if (pos + needle_size <= begin + value_offsets[i]) {
587
191k
            result[i] = 1;
588
191k
        }
589
590
        // move to next string offset
591
203k
        pos = begin + value_offsets[i];
592
203k
        ++i;
593
203k
    }
594
595
1.50k
    return Status::OK();
596
1.50k
}
597
598
Status FunctionLikeBase::vector_const(const ColumnString& values, const StringRef* pattern_val,
599
                                      ColumnUInt8::Container& result, const LikeFn& function,
600
1.89k
                                      LikeSearchState* search_state) const {
601
1.89k
    RETURN_IF_ERROR((function)(search_state, values,
602
1.89k
                               *reinterpret_cast<const StringRef*>(pattern_val), result));
603
1.89k
    return Status::OK();
604
1.89k
}
605
606
template <bool LIKE_PATTERN>
607
424
VPatternSearchStateSPtr FunctionLikeBase::pattern_type_recognition(const ColumnString& patterns) {
608
424
    VPatternSearchStateSPtr allpass_state = std::make_shared<VectorAllpassSearchState>();
609
424
    VPatternSearchStateSPtr equal_state = std::make_shared<VectorEqualSearchState>();
610
424
    VPatternSearchStateSPtr substring_state = std::make_shared<VectorSubStringSearchState>();
611
424
    VPatternSearchStateSPtr starts_with_state = std::make_shared<VectorStartsWithSearchState>();
612
424
    VPatternSearchStateSPtr ends_with_state = std::make_shared<VectorEndsWithSearchState>();
613
424
    size_t size = patterns.size();
614
615
977
    for (size_t i = 0; i < size; ++i) {
616
565
        if (!allpass_state->_pattern_matched && !equal_state->_pattern_matched &&
617
565
            !substring_state->_pattern_matched && !starts_with_state->_pattern_matched &&
618
565
            !ends_with_state->_pattern_matched) {
619
12
            return nullptr;
620
12
        }
621
553
        std::string pattern_str = patterns.get_data_at(i).to_string();
622
553
        if (allpass_state->_pattern_matched) {
623
429
            if constexpr (LIKE_PATTERN) {
624
397
                allpass_state->like_pattern_match(pattern_str);
625
397
            } else {
626
32
                allpass_state->regexp_pattern_match(pattern_str);
627
32
            }
628
429
        }
629
553
        if (equal_state->_pattern_matched) {
630
455
            if constexpr (LIKE_PATTERN) {
631
423
                equal_state->like_pattern_match(pattern_str);
632
423
            } else {
633
32
                equal_state->regexp_pattern_match(pattern_str);
634
32
            }
635
455
        }
636
553
        if (substring_state->_pattern_matched) {
637
470
            if constexpr (LIKE_PATTERN) {
638
415
                substring_state->like_pattern_match(pattern_str);
639
415
            } else {
640
55
                substring_state->regexp_pattern_match(pattern_str);
641
55
            }
642
470
        }
643
553
        if (starts_with_state->_pattern_matched) {
644
454
            if constexpr (LIKE_PATTERN) {
645
404
                starts_with_state->like_pattern_match(pattern_str);
646
404
            } else {
647
50
                starts_with_state->regexp_pattern_match(pattern_str);
648
50
            }
649
454
        }
650
553
        if (ends_with_state->_pattern_matched) {
651
441
            if constexpr (LIKE_PATTERN) {
652
403
                ends_with_state->like_pattern_match(pattern_str);
653
403
            } else {
654
38
                ends_with_state->regexp_pattern_match(pattern_str);
655
38
            }
656
441
        }
657
553
    }
658
659
412
    if (allpass_state->_pattern_matched) {
660
21
        return allpass_state;
661
391
    } else if (equal_state->_pattern_matched) {
662
88
        return equal_state;
663
303
    } else if (substring_state->_pattern_matched) {
664
41
        return substring_state;
665
262
    } else if (starts_with_state->_pattern_matched) {
666
68
        return starts_with_state;
667
194
    } else if (ends_with_state->_pattern_matched) {
668
34
        return ends_with_state;
669
160
    } else {
670
160
        return nullptr;
671
160
    }
672
412
}
_ZN5doris16FunctionLikeBase24pattern_type_recognitionILb1EEESt10shared_ptrINS_24VectorPatternSearchStateEERKNS_9ColumnStrIjEE
Line
Count
Source
607
392
VPatternSearchStateSPtr FunctionLikeBase::pattern_type_recognition(const ColumnString& patterns) {
608
392
    VPatternSearchStateSPtr allpass_state = std::make_shared<VectorAllpassSearchState>();
609
392
    VPatternSearchStateSPtr equal_state = std::make_shared<VectorEqualSearchState>();
610
392
    VPatternSearchStateSPtr substring_state = std::make_shared<VectorSubStringSearchState>();
611
392
    VPatternSearchStateSPtr starts_with_state = std::make_shared<VectorStartsWithSearchState>();
612
392
    VPatternSearchStateSPtr ends_with_state = std::make_shared<VectorEndsWithSearchState>();
613
392
    size_t size = patterns.size();
614
615
866
    for (size_t i = 0; i < size; ++i) {
616
486
        if (!allpass_state->_pattern_matched && !equal_state->_pattern_matched &&
617
486
            !substring_state->_pattern_matched && !starts_with_state->_pattern_matched &&
618
486
            !ends_with_state->_pattern_matched) {
619
12
            return nullptr;
620
12
        }
621
474
        std::string pattern_str = patterns.get_data_at(i).to_string();
622
474
        if (allpass_state->_pattern_matched) {
623
397
            if constexpr (LIKE_PATTERN) {
624
397
                allpass_state->like_pattern_match(pattern_str);
625
            } else {
626
                allpass_state->regexp_pattern_match(pattern_str);
627
            }
628
397
        }
629
474
        if (equal_state->_pattern_matched) {
630
423
            if constexpr (LIKE_PATTERN) {
631
423
                equal_state->like_pattern_match(pattern_str);
632
            } else {
633
                equal_state->regexp_pattern_match(pattern_str);
634
            }
635
423
        }
636
474
        if (substring_state->_pattern_matched) {
637
415
            if constexpr (LIKE_PATTERN) {
638
415
                substring_state->like_pattern_match(pattern_str);
639
            } else {
640
                substring_state->regexp_pattern_match(pattern_str);
641
            }
642
415
        }
643
474
        if (starts_with_state->_pattern_matched) {
644
404
            if constexpr (LIKE_PATTERN) {
645
404
                starts_with_state->like_pattern_match(pattern_str);
646
            } else {
647
                starts_with_state->regexp_pattern_match(pattern_str);
648
            }
649
404
        }
650
474
        if (ends_with_state->_pattern_matched) {
651
403
            if constexpr (LIKE_PATTERN) {
652
403
                ends_with_state->like_pattern_match(pattern_str);
653
            } else {
654
                ends_with_state->regexp_pattern_match(pattern_str);
655
            }
656
403
        }
657
474
    }
658
659
380
    if (allpass_state->_pattern_matched) {
660
21
        return allpass_state;
661
359
    } else if (equal_state->_pattern_matched) {
662
88
        return equal_state;
663
271
    } else if (substring_state->_pattern_matched) {
664
30
        return substring_state;
665
241
    } else if (starts_with_state->_pattern_matched) {
666
63
        return starts_with_state;
667
178
    } else if (ends_with_state->_pattern_matched) {
668
31
        return ends_with_state;
669
147
    } else {
670
147
        return nullptr;
671
147
    }
672
380
}
_ZN5doris16FunctionLikeBase24pattern_type_recognitionILb0EEESt10shared_ptrINS_24VectorPatternSearchStateEERKNS_9ColumnStrIjEE
Line
Count
Source
607
32
VPatternSearchStateSPtr FunctionLikeBase::pattern_type_recognition(const ColumnString& patterns) {
608
32
    VPatternSearchStateSPtr allpass_state = std::make_shared<VectorAllpassSearchState>();
609
32
    VPatternSearchStateSPtr equal_state = std::make_shared<VectorEqualSearchState>();
610
32
    VPatternSearchStateSPtr substring_state = std::make_shared<VectorSubStringSearchState>();
611
32
    VPatternSearchStateSPtr starts_with_state = std::make_shared<VectorStartsWithSearchState>();
612
32
    VPatternSearchStateSPtr ends_with_state = std::make_shared<VectorEndsWithSearchState>();
613
32
    size_t size = patterns.size();
614
615
111
    for (size_t i = 0; i < size; ++i) {
616
79
        if (!allpass_state->_pattern_matched && !equal_state->_pattern_matched &&
617
79
            !substring_state->_pattern_matched && !starts_with_state->_pattern_matched &&
618
79
            !ends_with_state->_pattern_matched) {
619
0
            return nullptr;
620
0
        }
621
79
        std::string pattern_str = patterns.get_data_at(i).to_string();
622
79
        if (allpass_state->_pattern_matched) {
623
            if constexpr (LIKE_PATTERN) {
624
                allpass_state->like_pattern_match(pattern_str);
625
32
            } else {
626
32
                allpass_state->regexp_pattern_match(pattern_str);
627
32
            }
628
32
        }
629
79
        if (equal_state->_pattern_matched) {
630
            if constexpr (LIKE_PATTERN) {
631
                equal_state->like_pattern_match(pattern_str);
632
32
            } else {
633
32
                equal_state->regexp_pattern_match(pattern_str);
634
32
            }
635
32
        }
636
79
        if (substring_state->_pattern_matched) {
637
            if constexpr (LIKE_PATTERN) {
638
                substring_state->like_pattern_match(pattern_str);
639
55
            } else {
640
55
                substring_state->regexp_pattern_match(pattern_str);
641
55
            }
642
55
        }
643
79
        if (starts_with_state->_pattern_matched) {
644
            if constexpr (LIKE_PATTERN) {
645
                starts_with_state->like_pattern_match(pattern_str);
646
50
            } else {
647
50
                starts_with_state->regexp_pattern_match(pattern_str);
648
50
            }
649
50
        }
650
79
        if (ends_with_state->_pattern_matched) {
651
            if constexpr (LIKE_PATTERN) {
652
                ends_with_state->like_pattern_match(pattern_str);
653
38
            } else {
654
38
                ends_with_state->regexp_pattern_match(pattern_str);
655
38
            }
656
38
        }
657
79
    }
658
659
32
    if (allpass_state->_pattern_matched) {
660
0
        return allpass_state;
661
32
    } else if (equal_state->_pattern_matched) {
662
0
        return equal_state;
663
32
    } else if (substring_state->_pattern_matched) {
664
11
        return substring_state;
665
21
    } else if (starts_with_state->_pattern_matched) {
666
5
        return starts_with_state;
667
16
    } else if (ends_with_state->_pattern_matched) {
668
3
        return ends_with_state;
669
13
    } else {
670
13
        return nullptr;
671
13
    }
672
32
}
673
674
Status FunctionLikeBase::vector_non_const(const ColumnString& values, const ColumnString& patterns,
675
                                          ColumnUInt8::Container& result, LikeState* state,
676
424
                                          size_t input_rows_count) const {
677
424
    ColumnString::MutablePtr replaced_patterns;
678
424
    VPatternSearchStateSPtr vector_search_state;
679
424
    if (state->is_like_pattern) {
680
392
        if (state->has_custom_escape) {
681
5
            replaced_patterns = ColumnString::create();
682
10
            for (int i = 0; i < input_rows_count; ++i) {
683
5
                std::string val =
684
5
                        replace_pattern_by_escape(patterns.get_data_at(i), state->escape_char);
685
5
                replaced_patterns->insert_data(val.c_str(), val.size());
686
5
            }
687
5
            vector_search_state = pattern_type_recognition<true>(*replaced_patterns);
688
387
        } else {
689
387
            vector_search_state = pattern_type_recognition<true>(patterns);
690
387
        }
691
392
    } else {
692
32
        vector_search_state = pattern_type_recognition<false>(patterns);
693
32
    }
694
695
424
    const ColumnString& real_pattern = state->has_custom_escape ? *replaced_patterns : patterns;
696
697
424
    if (vector_search_state == nullptr) {
698
        // pattern type recognition failed, use default case
699
510
        for (int i = 0; i < input_rows_count; ++i) {
700
338
            const auto pattern_val = real_pattern.get_data_at(i);
701
338
            const auto value_val = values.get_data_at(i);
702
338
            RETURN_IF_ERROR((state->scalar_function)(&state->search_state, value_val, pattern_val,
703
338
                                                     &result[i]));
704
338
        }
705
172
        return Status::OK();
706
172
    }
707
252
    const auto* search_strings =
708
252
            static_cast<const ColumnString*>(vector_search_state->_search_strings.get());
709
252
    return (vector_search_state->_vector_function)(values, *search_strings, result);
710
424
}
711
712
Status FunctionLike::like_fn(const LikeSearchState* state, const ColumnString& val,
713
0
                             const StringRef& pattern, ColumnUInt8::Container& result) {
714
0
    std::string re_pattern;
715
0
    convert_like_pattern(state, std::string(pattern.data, pattern.size), &re_pattern);
716
0
    return regexp_fn(state, val, {re_pattern.c_str(), re_pattern.size()}, result);
717
0
}
718
719
Status FunctionLike::like_fn_scalar(const LikeSearchState* state, const StringRef& val,
720
243
                                    const StringRef& pattern, unsigned char* result) {
721
    // Try to use fast path to avoid regex compilation
722
243
    std::string search_string;
723
243
    LikeFastPath fast_path = extract_like_fast_path(pattern.data, pattern.size, search_string);
724
725
243
    switch (fast_path) {
726
12
    case LikeFastPath::ALLPASS:
727
12
        *result = 1;
728
12
        return Status::OK();
729
61
    case LikeFastPath::EQUALS:
730
61
        *result = (val.size == search_string.size() &&
731
61
                   (search_string.empty() ||
732
33
                    memcmp(val.data, search_string.data(), search_string.size()) == 0));
733
61
        return Status::OK();
734
22
    case LikeFastPath::STARTS_WITH:
735
22
        *result = (val.size >= search_string.size() &&
736
22
                   memcmp(val.data, search_string.data(), search_string.size()) == 0);
737
22
        return Status::OK();
738
16
    case LikeFastPath::ENDS_WITH:
739
16
        *result = (val.size >= search_string.size() &&
740
16
                   memcmp(val.data + val.size - search_string.size(), search_string.data(),
741
15
                          search_string.size()) == 0);
742
16
        return Status::OK();
743
34
    case LikeFastPath::SUBSTRING:
744
34
        if (search_string.empty()) {
745
0
            *result = 1;
746
34
        } else {
747
            // Use memmem for substring search
748
34
            *result = (memmem(val.data, val.size, search_string.data(), search_string.size()) !=
749
34
                       nullptr);
750
34
        }
751
34
        return Status::OK();
752
98
    case LikeFastPath::REGEX:
753
98
    default:
754
        // Fall back to regex matching
755
98
        std::string re_pattern;
756
98
        convert_like_pattern(state, std::string(pattern.data, pattern.size), &re_pattern);
757
98
        return regexp_fn_scalar(state, StringRef(val.data, val.size),
758
98
                                {re_pattern.c_str(), re_pattern.size()}, result);
759
243
    }
760
243
}
761
762
void FunctionLike::convert_like_pattern(const LikeSearchState* state, const std::string& pattern,
763
1.54k
                                        std::string* re_pattern) {
764
1.54k
    re_pattern->clear();
765
766
1.54k
    if (pattern.empty()) {
767
0
        re_pattern->append("^$");
768
0
        return;
769
0
    }
770
771
    // add ^ to pattern head to match line head
772
1.55k
    if (!pattern.empty() && pattern[0] != '%') {
773
963
        re_pattern->append("^");
774
963
    }
775
776
    // expect % and _, all chars should keep it literal mean.
777
14.4k
    for (size_t i = 0; i < pattern.size(); i++) {
778
12.8k
        char c = pattern[i];
779
12.8k
        if (c == '\\' && i + 1 < pattern.size()) {
780
526
            char next_c = pattern[i + 1];
781
526
            if (next_c == '%' || next_c == '_') {
782
                // convert "\%" and "\_" to literal "%" and "_"
783
201
                re_pattern->append(1, next_c);
784
201
                i++;
785
201
                continue;
786
325
            } else if (next_c == '\\') {
787
                // keep valid escape "\\"
788
303
                re_pattern->append("\\\\");
789
303
                i++;
790
303
                continue;
791
303
            }
792
526
        }
793
794
12.3k
        if (c == '%') {
795
1.98k
            re_pattern->append(".*");
796
10.3k
        } else if (c == '_') {
797
1.37k
            re_pattern->append(".");
798
9.02k
        } else {
799
            // special for hyperscan: [, ], (, ), {, }, -, *, +, \, |, /, :, ^, ., $, ?
800
9.02k
            if (c == '[' || c == ']' || c == '(' || c == ')' || c == '{' || c == '}' || c == '-' ||
801
9.02k
                c == '*' || c == '+' || c == '\\' || c == '|' || c == '/' || c == ':' || c == '^' ||
802
9.02k
                c == '.' || c == '$' || c == '?') {
803
120
                re_pattern->append(1, '\\');
804
120
            }
805
9.02k
            re_pattern->append(1, c);
806
9.02k
        }
807
12.3k
    }
808
809
    // add $ to pattern tail to match line tail
810
1.54k
    if (!pattern.empty() && re_pattern->back() != '*') {
811
596
        re_pattern->append("$");
812
596
    }
813
1.54k
}
814
815
3.82k
void FunctionLike::remove_escape_character(std::string* search_string) {
816
3.82k
    std::string tmp_search_string;
817
3.82k
    tmp_search_string.swap(*search_string);
818
3.82k
    int64_t len = tmp_search_string.length();
819
    // sometime 'like' may allowed converted to 'equals/start_with/end_with/sub_with'
820
    // so we need to remove escape from pattern to construct search string and use to do 'equals/start_with/end_with/sub_with'
821
19.3k
    for (int i = 0; i < len;) {
822
15.5k
        if (tmp_search_string[i] == '\\' && i + 1 < len &&
823
15.5k
            (tmp_search_string[i + 1] == '%' || tmp_search_string[i + 1] == '_' ||
824
93
             tmp_search_string[i + 1] == '\\')) {
825
85
            search_string->append(1, tmp_search_string[i + 1]);
826
85
            i += 2;
827
15.4k
        } else {
828
15.4k
            search_string->append(1, tmp_search_string[i]);
829
15.4k
            i++;
830
15.4k
        }
831
15.5k
    }
832
3.82k
}
833
834
0
bool re2_full_match(const std::string& str, const RE2& re, std::vector<std::string>& results) {
835
0
    if (!re.ok()) {
836
0
        return false;
837
0
    }
838
839
0
    std::vector<RE2::Arg> arguments;
840
0
    std::vector<RE2::Arg*> arguments_ptrs;
841
0
    std::size_t args_count = re.NumberOfCapturingGroups();
842
0
    arguments.resize(args_count);
843
0
    arguments_ptrs.resize(args_count);
844
0
    results.resize(args_count);
845
0
    for (std::size_t i = 0; i < args_count; ++i) {
846
0
        arguments[i] = &results[i];
847
0
        arguments_ptrs[i] = &arguments[i];
848
0
    }
849
850
0
    return RE2::FullMatchN(str, re, arguments_ptrs.data(), (int)args_count);
851
0
}
852
853
0
void verbose_log_match(const std::string& str, const std::string& pattern_name, const RE2& re) {
854
0
    std::vector<std::string> results;
855
0
    VLOG_DEBUG << "arg str: " << str << ", size: " << str.size() << ", pattern " << pattern_name
856
0
               << ": " << re.pattern() << ", size: " << re.pattern().size();
857
0
    if (re2_full_match(str, re, results)) {
858
0
        for (int i = 0; i < results.size(); ++i) {
859
0
            VLOG_DEBUG << "match " << i << ": " << results[i] << ", size: " << results[i].size();
860
0
        }
861
0
    } else {
862
0
        VLOG_DEBUG << "no match";
863
0
    }
864
0
}
865
866
Status FunctionLike::construct_like_const_state(FunctionContext* context, const StringRef& pattern,
867
                                                std::shared_ptr<LikeState>& state,
868
4.74k
                                                bool try_hyperscan) {
869
4.74k
    std::string pattern_str;
870
4.74k
    if (state->has_custom_escape) {
871
11
        pattern_str = replace_pattern_by_escape(pattern, state->escape_char);
872
4.72k
    } else {
873
4.72k
        pattern_str = pattern.to_string();
874
4.72k
    }
875
4.74k
    state->search_state.pattern_str = pattern_str;
876
4.74k
    std::string search_string;
877
878
4.74k
    if (!pattern_str.empty() && RE2::FullMatch(pattern_str, LIKE_ALLPASS_RE)) {
879
91
        state->search_state.set_search_string("");
880
91
        state->function = constant_allpass_fn;
881
91
        state->scalar_function = constant_allpass_fn_scalar;
882
4.64k
    } else if (pattern_str.empty() || RE2::FullMatch(pattern_str, LIKE_EQUALS_RE, &search_string)) {
883
193
        if (VLOG_DEBUG_IS_ON) {
884
0
            verbose_log_match(pattern_str, "LIKE_EQUALS_RE", LIKE_EQUALS_RE);
885
0
            VLOG_DEBUG << "search_string : " << search_string << ", size: " << search_string.size();
886
0
        }
887
193
        remove_escape_character(&search_string);
888
193
        if (VLOG_DEBUG_IS_ON) {
889
0
            VLOG_DEBUG << "search_string escape removed: " << search_string
890
0
                       << ", size: " << search_string.size();
891
0
        }
892
193
        state->search_state.set_search_string(search_string);
893
193
        state->function = constant_equals_fn;
894
193
        state->scalar_function = constant_equals_fn_scalar;
895
4.45k
    } else if (RE2::FullMatch(pattern_str, LIKE_STARTS_WITH_RE, &search_string)) {
896
1.89k
        if (VLOG_DEBUG_IS_ON) {
897
0
            verbose_log_match(pattern_str, "LIKE_STARTS_WITH_RE", LIKE_STARTS_WITH_RE);
898
0
            VLOG_DEBUG << "search_string : " << search_string << ", size: " << search_string.size();
899
0
        }
900
1.89k
        remove_escape_character(&search_string);
901
1.89k
        if (VLOG_DEBUG_IS_ON) {
902
0
            VLOG_DEBUG << "search_string escape removed: " << search_string
903
0
                       << ", size: " << search_string.size();
904
0
        }
905
1.89k
        state->search_state.set_search_string(search_string);
906
1.89k
        state->function = constant_starts_with_fn;
907
1.89k
        state->scalar_function = constant_starts_with_fn_scalar;
908
2.56k
    } else if (RE2::FullMatch(pattern_str, LIKE_ENDS_WITH_RE, &search_string)) {
909
204
        if (VLOG_DEBUG_IS_ON) {
910
0
            verbose_log_match(pattern_str, "LIKE_ENDS_WITH_RE", LIKE_ENDS_WITH_RE);
911
0
            VLOG_DEBUG << "search_string : " << search_string << ", size: " << search_string.size();
912
0
        }
913
204
        remove_escape_character(&search_string);
914
204
        if (VLOG_DEBUG_IS_ON) {
915
0
            VLOG_DEBUG << "search_string escape removed: " << search_string
916
0
                       << ", size: " << search_string.size();
917
0
        }
918
204
        state->search_state.set_search_string(search_string);
919
204
        state->function = constant_ends_with_fn;
920
204
        state->scalar_function = constant_ends_with_fn_scalar;
921
2.36k
    } else if (RE2::FullMatch(pattern_str, LIKE_SUBSTRING_RE, &search_string)) {
922
1.25k
        if (VLOG_DEBUG_IS_ON) {
923
0
            verbose_log_match(pattern_str, "LIKE_SUBSTRING_RE", LIKE_SUBSTRING_RE);
924
0
            VLOG_DEBUG << "search_string : " << search_string << ", size: " << search_string.size();
925
0
        }
926
1.25k
        remove_escape_character(&search_string);
927
1.25k
        if (VLOG_DEBUG_IS_ON) {
928
0
            VLOG_DEBUG << "search_string escape removed: " << search_string
929
0
                       << ", size: " << search_string.size();
930
0
        }
931
1.25k
        state->search_state.set_search_string(search_string);
932
1.25k
        state->function = constant_substring_fn;
933
1.25k
        state->scalar_function = constant_substring_fn_scalar;
934
1.25k
    } else {
935
1.10k
        std::string re_pattern;
936
1.10k
        convert_like_pattern(&state->search_state, pattern_str, &re_pattern);
937
1.10k
        if (VLOG_DEBUG_IS_ON) {
938
0
            VLOG_DEBUG << "hyperscan, pattern str: " << pattern_str
939
0
                       << ", size: " << pattern_str.size() << ", re pattern: " << re_pattern
940
0
                       << ", size: " << re_pattern.size();
941
0
        }
942
943
1.10k
        hs_database_t* database = nullptr;
944
1.10k
        hs_scratch_t* scratch = nullptr;
945
1.10k
        if (try_hyperscan && hs_prepare(context, re_pattern.c_str(), &database, &scratch).ok()) {
946
            // use hyperscan
947
1.00k
            state->search_state.hs_database.reset(database);
948
1.00k
            state->search_state.hs_scratch.reset(scratch);
949
1.00k
        } else {
950
            // fallback to re2
951
            // reset hs_database to nullptr to indicate not use hyperscan
952
101
            state->search_state.hs_database.reset();
953
101
            state->search_state.hs_scratch.reset();
954
955
101
            RE2::Options opts;
956
101
            opts.set_never_nl(false);
957
101
            opts.set_dot_nl(true);
958
101
            state->search_state.regex = std::make_unique<RE2>(re_pattern, opts);
959
101
            if (!state->search_state.regex->ok()) {
960
0
                return Status::InternalError("Invalid regex expression: {}(origin: {})", re_pattern,
961
0
                                             pattern_str);
962
0
            }
963
101
        }
964
965
1.10k
        state->function = constant_regex_fn;
966
1.10k
        state->scalar_function = constant_regex_fn_scalar;
967
1.10k
    }
968
4.74k
    return Status::OK();
969
4.74k
}
970
971
5.93k
Status FunctionLike::open(FunctionContext* context, FunctionContext::FunctionStateScope scope) {
972
5.93k
    if (scope != FunctionContext::THREAD_LOCAL) {
973
1.14k
        return Status::OK();
974
1.14k
    }
975
4.79k
    std::shared_ptr<LikeState> state = std::make_shared<LikeState>();
976
4.79k
    state->is_like_pattern = true;
977
4.79k
    state->function = like_fn;
978
4.79k
    state->scalar_function = like_fn_scalar;
979
4.79k
    if (context->is_col_constant(2)) {
980
11
        state->has_custom_escape = true;
981
11
        const auto escape_col = context->get_constant_col(2)->column_ptr;
982
11
        const auto& escape = escape_col->get_data_at(0);
983
11
        if (escape.size != 1) {
984
0
            return Status::InternalError("Escape character must be a single character, got: {}",
985
0
                                         escape.to_string());
986
0
        }
987
11
        state->escape_char = escape.data[0];
988
11
    }
989
4.79k
    if (context->is_col_constant(1)) {
990
4.56k
        const auto pattern_col = context->get_constant_col(1)->column_ptr;
991
4.56k
        const auto& pattern = pattern_col->get_data_at(0);
992
4.56k
        RETURN_IF_ERROR(construct_like_const_state(context, pattern, state));
993
4.56k
    }
994
4.79k
    context->set_function_state(scope, state);
995
996
4.79k
    return Status::OK();
997
4.79k
}
998
999
Status FunctionRegexpLike::open(FunctionContext* context,
1000
659
                                FunctionContext::FunctionStateScope scope) {
1001
659
    if (scope != FunctionContext::THREAD_LOCAL) {
1002
129
        return Status::OK();
1003
129
    }
1004
530
    std::shared_ptr<LikeState> state = std::make_shared<LikeState>();
1005
530
    context->set_function_state(scope, state);
1006
530
    state->is_like_pattern = false;
1007
530
    state->function = regexp_fn;
1008
530
    state->scalar_function = regexp_fn_scalar;
1009
530
    if (context->is_col_constant(1)) {
1010
483
        const auto pattern_col = context->get_constant_col(1)->column_ptr;
1011
483
        const auto& pattern = pattern_col->get_data_at(0);
1012
1013
483
        std::string pattern_str = pattern.to_string();
1014
483
        std::string search_string;
1015
483
        if (RE2::FullMatch(pattern_str, ALLPASS_RE)) {
1016
24
            state->search_state.set_search_string("");
1017
24
            state->function = constant_allpass_fn;
1018
24
            state->scalar_function = constant_allpass_fn_scalar;
1019
459
        } else if (RE2::FullMatch(pattern_str, EQUALS_RE, &search_string)) {
1020
7
            state->search_state.set_search_string(search_string);
1021
7
            state->function = constant_equals_fn;
1022
7
            state->scalar_function = constant_equals_fn_scalar;
1023
452
        } else if (RE2::FullMatch(pattern_str, STARTS_WITH_RE, &search_string)) {
1024
67
            state->search_state.set_search_string(search_string);
1025
67
            state->function = constant_starts_with_fn;
1026
67
            state->scalar_function = constant_starts_with_fn_scalar;
1027
385
        } else if (RE2::FullMatch(pattern_str, ENDS_WITH_RE, &search_string)) {
1028
59
            state->search_state.set_search_string(search_string);
1029
59
            state->function = constant_ends_with_fn;
1030
59
            state->scalar_function = constant_ends_with_fn_scalar;
1031
326
        } else if (RE2::FullMatch(pattern_str, SUBSTRING_RE, &search_string)) {
1032
78
            state->search_state.set_search_string(search_string);
1033
78
            state->function = constant_substring_fn;
1034
78
            state->scalar_function = constant_substring_fn_scalar;
1035
248
        } else {
1036
248
            hs_database_t* database = nullptr;
1037
248
            hs_scratch_t* scratch = nullptr;
1038
248
            if (hs_prepare(context, pattern_str.c_str(), &database, &scratch).ok()) {
1039
                // use hyperscan
1040
229
                state->search_state.hs_database.reset(database);
1041
229
                state->search_state.hs_scratch.reset(scratch);
1042
229
            } else {
1043
                // fallback to re2
1044
                // reset hs_database to nullptr to indicate not use hyperscan
1045
19
                state->search_state.hs_database.reset();
1046
19
                state->search_state.hs_scratch.reset();
1047
19
                RE2::Options opts;
1048
19
                opts.set_never_nl(false);
1049
19
                opts.set_dot_nl(true);
1050
19
                state->search_state.regex = std::make_unique<RE2>(pattern_str, opts);
1051
19
                if (!state->search_state.regex->ok()) {
1052
9
                    if (!context->state()->enable_extended_regex()) {
1053
1
                        return Status::InternalError(
1054
1
                                "Invalid regex expression: {}. Error: {}. If you need advanced "
1055
1
                                "regex features, try setting enable_extended_regex=true",
1056
1
                                pattern_str, state->search_state.regex->error());
1057
1
                    }
1058
1059
                    // RE2 failed, fallback to Boost.Regex
1060
                    // This handles advanced regex features like zero-width assertions
1061
8
                    state->search_state.regex.reset();
1062
8
                    try {
1063
8
                        state->search_state.boost_regex =
1064
8
                                std::make_unique<boost::regex>(pattern_str);
1065
8
                    } catch (const boost::regex_error& e) {
1066
0
                        return Status::InternalError("Invalid regex expression: {}. Error: {}",
1067
0
                                                     pattern_str, e.what());
1068
0
                    }
1069
8
                }
1070
19
            }
1071
247
            state->function = constant_regex_fn;
1072
247
            state->scalar_function = constant_regex_fn_scalar;
1073
247
        }
1074
483
    }
1075
529
    return Status::OK();
1076
530
}
1077
1078
8
void register_function_like(SimpleFunctionFactory& factory) {
1079
8
    factory.register_function<FunctionLike>();
1080
8
}
1081
1082
8
void register_function_regexp(SimpleFunctionFactory& factory) {
1083
8
    factory.register_function<FunctionRegexpLike>();
1084
8
    factory.register_alias(FunctionRegexpLike::name, FunctionRegexpLike::alias);
1085
8
}
1086
} // namespace doris