Coverage Report

Created: 2026-03-16 13:09

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exprs/function/like.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "exprs/function/like.h"
19
20
#include <fmt/format.h>
21
#include <hs/hs_compile.h>
22
#include <re2/stringpiece.h>
23
24
#include <cstddef>
25
#include <ostream>
26
#include <utility>
27
#include <vector>
28
29
#include "common/logging.h"
30
#include "core/block/block.h"
31
#include "core/block/column_with_type_and_name.h"
32
#include "core/column/column.h"
33
#include "core/column/column_const.h"
34
#include "core/column/column_vector.h"
35
#include "core/string_ref.h"
36
#include "exprs/function/simple_function_factory.h"
37
38
namespace doris {
39
#include "common/compile_check_begin.h"
40
// A regex to match any regex pattern is equivalent to a substring search.
41
static const RE2 SUBSTRING_RE(R"((?:\.\*)*([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)(?:\.\*)*)");
42
43
// A regex to match any regex pattern which is equivalent to matching a constant string
44
// at the end of the string values.
45
static const RE2 ENDS_WITH_RE(R"((?:\.\*)*([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)\$)");
46
47
// A regex to match any regex pattern which is equivalent to matching a constant string
48
// at the end of the string values.
49
static const RE2 STARTS_WITH_RE(R"(\^([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)(?:\.\*)*)");
50
51
// A regex to match any regex pattern which is equivalent to a constant string match.
52
static const RE2 EQUALS_RE(R"(\^([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)\$)");
53
// A regex to match .*
54
static const RE2 ALLPASS_RE(R"((\.\*)+)");
55
56
// Like patterns
57
static const re2::RE2 LIKE_SUBSTRING_RE(R"((?:%+)(((\\_)|([^%_\\]))+)(?:%+))");
58
static const re2::RE2 LIKE_ENDS_WITH_RE("(?:%+)(((\\\\_)|([^%_]))+)");
59
static const re2::RE2 LIKE_STARTS_WITH_RE(R"((((\\%)|(\\_)|([^%_\\]))+)(?:%+))");
60
static const re2::RE2 LIKE_EQUALS_RE("(((\\\\_)|([^%_]))+)");
61
static const re2::RE2 LIKE_ALLPASS_RE("%+");
62
63
struct VectorAllpassSearchState : public VectorPatternSearchState {
64
451
    VectorAllpassSearchState() : VectorPatternSearchState(FunctionLikeBase::vector_allpass_fn) {}
65
66
    ~VectorAllpassSearchState() override = default;
67
68
424
    void like_pattern_match(const std::string& pattern_str) override {
69
424
        if (!pattern_str.empty() && RE2::FullMatch(pattern_str, LIKE_ALLPASS_RE)) {
70
26
            _search_strings->insert_default();
71
398
        } else {
72
398
            _pattern_matched = false;
73
398
        }
74
424
    }
75
76
32
    void regexp_pattern_match(const std::string& pattern_str) override {
77
32
        if (RE2::FullMatch(pattern_str, ALLPASS_RE)) {
78
0
            _search_strings->insert_default();
79
32
        } else {
80
32
            _pattern_matched = false;
81
32
        }
82
32
    }
83
};
84
85
struct VectorEqualSearchState : public VectorPatternSearchState {
86
451
    VectorEqualSearchState() : VectorPatternSearchState(FunctionLikeBase::vector_equals_fn) {}
87
88
    ~VectorEqualSearchState() override = default;
89
90
450
    void like_pattern_match(const std::string& pattern_str) override {
91
450
        _search_string.clear();
92
450
        if (pattern_str.empty() || RE2::FullMatch(pattern_str, LIKE_EQUALS_RE, &_search_string)) {
93
119
            FunctionLike::remove_escape_character(&_search_string);
94
119
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
95
331
        } else {
96
331
            _pattern_matched = false;
97
331
        }
98
450
    }
99
100
32
    void regexp_pattern_match(const std::string& pattern_str) override {
101
32
        _search_string.clear();
102
32
        if (RE2::FullMatch(pattern_str, EQUALS_RE, &_search_string)) {
103
0
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
104
32
        } else {
105
32
            _pattern_matched = false;
106
32
        }
107
32
    }
108
};
109
110
struct VectorSubStringSearchState : public VectorPatternSearchState {
111
    VectorSubStringSearchState()
112
451
            : VectorPatternSearchState(FunctionLikeBase::vector_substring_fn) {}
113
114
    ~VectorSubStringSearchState() override = default;
115
116
442
    void like_pattern_match(const std::string& pattern_str) override {
117
442
        _search_string.clear();
118
442
        if (RE2::FullMatch(pattern_str, LIKE_SUBSTRING_RE, &_search_string)) {
119
53
            FunctionLike::remove_escape_character(&_search_string);
120
53
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
121
389
        } else {
122
389
            _pattern_matched = false;
123
389
        }
124
442
    }
125
126
55
    void regexp_pattern_match(const std::string& pattern_str) override {
127
55
        _search_string.clear();
128
55
        if (RE2::FullMatch(pattern_str, SUBSTRING_RE, &_search_string)) {
129
34
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
130
34
        } else {
131
21
            _pattern_matched = false;
132
21
        }
133
55
    }
134
};
135
136
struct VectorStartsWithSearchState : public VectorPatternSearchState {
137
    VectorStartsWithSearchState()
138
451
            : VectorPatternSearchState(FunctionLikeBase::vector_starts_with_fn) {}
139
140
    ~VectorStartsWithSearchState() override = default;
141
142
431
    void like_pattern_match(const std::string& pattern_str) override {
143
431
        _search_string.clear();
144
431
        if (RE2::FullMatch(pattern_str, LIKE_STARTS_WITH_RE, &_search_string)) {
145
83
            FunctionLike::remove_escape_character(&_search_string);
146
83
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
147
348
        } else {
148
348
            _pattern_matched = false;
149
348
        }
150
431
    }
151
152
50
    void regexp_pattern_match(const std::string& pattern_str) override {
153
50
        _search_string.clear();
154
50
        if (RE2::FullMatch(pattern_str, STARTS_WITH_RE, &_search_string)) {
155
23
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
156
27
        } else {
157
27
            _pattern_matched = false;
158
27
        }
159
50
    }
160
};
161
162
struct VectorEndsWithSearchState : public VectorPatternSearchState {
163
451
    VectorEndsWithSearchState() : VectorPatternSearchState(FunctionLikeBase::vector_ends_with_fn) {}
164
165
    ~VectorEndsWithSearchState() override = default;
166
167
430
    void like_pattern_match(const std::string& pattern_str) override {
168
430
        _search_string.clear();
169
430
        if (RE2::FullMatch(pattern_str, LIKE_ENDS_WITH_RE, &_search_string)) {
170
42
            FunctionLike::remove_escape_character(&_search_string);
171
42
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
172
388
        } else {
173
388
            _pattern_matched = false;
174
388
        }
175
430
    }
176
177
38
    void regexp_pattern_match(const std::string& pattern_str) override {
178
38
        _search_string.clear();
179
38
        if (RE2::FullMatch(pattern_str, ENDS_WITH_RE, &_search_string)) {
180
9
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
181
29
        } else {
182
29
            _pattern_matched = false;
183
29
        }
184
38
    }
185
};
186
187
419
Status LikeSearchState::clone(LikeSearchState& cloned) {
188
419
    cloned.set_search_string(search_string);
189
190
419
    std::string re_pattern;
191
419
    FunctionLike::convert_like_pattern(this, pattern_str, &re_pattern);
192
419
    if (hs_database) { // use hyperscan
193
159
        hs_database_t* database = nullptr;
194
159
        hs_scratch_t* scratch = nullptr;
195
159
        RETURN_IF_ERROR(FunctionLike::hs_prepare(nullptr, re_pattern.c_str(), &database, &scratch));
196
197
159
        cloned.hs_database.reset(database);
198
159
        cloned.hs_scratch.reset(scratch);
199
260
    } else { // fallback to re2
200
260
        cloned.hs_database.reset();
201
260
        cloned.hs_scratch.reset();
202
203
260
        RE2::Options opts;
204
260
        opts.set_never_nl(false);
205
260
        opts.set_dot_nl(true);
206
260
        cloned.regex = std::make_unique<RE2>(re_pattern, opts);
207
260
        if (!cloned.regex->ok()) {
208
0
            return Status::InternalError("Invalid regex expression: {}", re_pattern);
209
0
        }
210
260
    }
211
212
419
    return Status::OK();
213
419
}
214
215
Status FunctionLikeBase::constant_allpass_fn(const LikeSearchState* state, const ColumnString& vals,
216
                                             const StringRef& pattern,
217
49
                                             ColumnUInt8::Container& result) {
218
49
    memset(result.data(), 1, vals.size());
219
49
    return Status::OK();
220
49
}
221
222
Status FunctionLikeBase::constant_allpass_fn_scalar(const LikeSearchState* state,
223
                                                    const StringRef& val, const StringRef& pattern,
224
72
                                                    unsigned char* result) {
225
72
    *result = 1;
226
72
    return Status::OK();
227
72
}
228
229
Status FunctionLikeBase::vector_allpass_fn(const ColumnString& vals,
230
                                           const ColumnString& search_strings,
231
21
                                           ColumnUInt8::Container& result) {
232
21
    DCHECK(vals.size() == search_strings.size());
233
21
    DCHECK(vals.size() == result.size());
234
21
    memset(result.data(), 1, vals.size());
235
21
    return Status::OK();
236
21
}
237
238
Status FunctionLikeBase::constant_starts_with_fn(const LikeSearchState* state,
239
                                                 const ColumnString& val, const StringRef& pattern,
240
273
                                                 ColumnUInt8::Container& result) {
241
273
    auto sz = val.size();
242
68.5k
    for (size_t i = 0; i < sz; i++) {
243
68.2k
        const auto& str_ref = val.get_data_at(i);
244
68.2k
        result[i] = (str_ref.size >= state->search_string_sv.size) &&
245
68.3k
                    str_ref.start_with(state->search_string_sv);
246
68.2k
    }
247
273
    return Status::OK();
248
273
}
249
250
Status FunctionLikeBase::constant_starts_with_fn_scalar(const LikeSearchState* state,
251
                                                        const StringRef& val,
252
                                                        const StringRef& pattern,
253
192k
                                                        unsigned char* result) {
254
192k
    *result = (val.size >= state->search_string_sv.size) &&
255
193k
              (state->search_string_sv == val.substring(0, state->search_string_sv.size));
256
192k
    return Status::OK();
257
192k
}
258
259
Status FunctionLikeBase::vector_starts_with_fn(const ColumnString& vals,
260
                                               const ColumnString& search_strings,
261
76
                                               ColumnUInt8::Container& result) {
262
76
    DCHECK(vals.size() == search_strings.size());
263
76
    DCHECK(vals.size() == result.size());
264
76
    auto sz = vals.size();
265
174
    for (size_t i = 0; i < sz; ++i) {
266
98
        const auto& str_sv = vals.get_data_at(i);
267
98
        const auto& search_string_sv = search_strings.get_data_at(i);
268
98
        result[i] = (str_sv.size >= search_string_sv.size) && str_sv.start_with(search_string_sv);
269
98
    }
270
76
    return Status::OK();
271
76
}
272
273
Status FunctionLikeBase::constant_ends_with_fn(const LikeSearchState* state,
274
                                               const ColumnString& val, const StringRef& pattern,
275
129
                                               ColumnUInt8::Container& result) {
276
129
    auto sz = val.size();
277
5.21k
    for (size_t i = 0; i < sz; i++) {
278
5.09k
        const auto& str_ref = val.get_data_at(i);
279
5.09k
        result[i] = (str_ref.size >= state->search_string_sv.size) &&
280
5.09k
                    str_ref.end_with(state->search_string_sv);
281
5.09k
    }
282
129
    return Status::OK();
283
129
}
284
285
Status FunctionLikeBase::constant_ends_with_fn_scalar(const LikeSearchState* state,
286
                                                      const StringRef& val,
287
                                                      const StringRef& pattern,
288
4.38k
                                                      unsigned char* result) {
289
4.38k
    *result = (val.size >= state->search_string_sv.size) &&
290
4.38k
              (state->search_string_sv == val.substring(val.size - state->search_string_sv.size,
291
4.37k
                                                        state->search_string_sv.size));
292
4.38k
    return Status::OK();
293
4.38k
}
294
295
Status FunctionLikeBase::vector_ends_with_fn(const ColumnString& vals,
296
                                             const ColumnString& search_strings,
297
34
                                             ColumnUInt8::Container& result) {
298
34
    DCHECK(vals.size() == search_strings.size());
299
34
    DCHECK(vals.size() == result.size());
300
34
    auto sz = vals.size();
301
78
    for (size_t i = 0; i < sz; ++i) {
302
44
        const auto& str_sv = vals.get_data_at(i);
303
44
        const auto& search_string_sv = search_strings.get_data_at(i);
304
44
        result[i] = (str_sv.size >= search_string_sv.size) && str_sv.end_with(search_string_sv);
305
44
    }
306
34
    return Status::OK();
307
34
}
308
309
Status FunctionLikeBase::constant_equals_fn(const LikeSearchState* state, const ColumnString& val,
310
                                            const StringRef& pattern,
311
50
                                            ColumnUInt8::Container& result) {
312
50
    auto sz = val.size();
313
127
    for (size_t i = 0; i < sz; i++) {
314
77
        result[i] = (val.get_data_at(i) == state->search_string_sv);
315
77
    }
316
50
    return Status::OK();
317
50
}
318
319
Status FunctionLikeBase::constant_equals_fn_scalar(const LikeSearchState* state,
320
                                                   const StringRef& val, const StringRef& pattern,
321
150
                                                   unsigned char* result) {
322
150
    *result = (val == state->search_string_sv);
323
150
    return Status::OK();
324
150
}
325
326
Status FunctionLikeBase::vector_equals_fn(const ColumnString& vals,
327
                                          const ColumnString& search_strings,
328
88
                                          ColumnUInt8::Container& result) {
329
88
    DCHECK(vals.size() == search_strings.size());
330
88
    DCHECK(vals.size() == result.size());
331
88
    auto sz = vals.size();
332
197
    for (size_t i = 0; i < sz; ++i) {
333
109
        const auto& str_sv = vals.get_data_at(i);
334
109
        const auto& search_string_sv = search_strings.get_data_at(i);
335
109
        result[i] = str_sv == search_string_sv;
336
109
    }
337
88
    return Status::OK();
338
88
}
339
340
Status FunctionLikeBase::constant_substring_fn(const LikeSearchState* state,
341
                                               const ColumnString& val, const StringRef& pattern,
342
0
                                               ColumnUInt8::Container& result) {
343
0
    auto sz = val.size();
344
0
    for (size_t i = 0; i < sz; i++) {
345
0
        if (state->search_string_sv.size == 0) {
346
0
            result[i] = true;
347
0
            continue;
348
0
        }
349
0
        result[i] = state->substring_pattern.search(val.get_data_at(i)) != -1;
350
0
    }
351
0
    return Status::OK();
352
0
}
353
354
Status FunctionLikeBase::constant_substring_fn_scalar(const LikeSearchState* state,
355
                                                      const StringRef& val,
356
                                                      const StringRef& pattern,
357
401k
                                                      unsigned char* result) {
358
401k
    if (state->search_string_sv.size == 0) {
359
0
        *result = true;
360
0
        return Status::OK();
361
0
    }
362
401k
    *result = state->substring_pattern.search(val) != -1;
363
401k
    return Status::OK();
364
401k
}
365
366
Status FunctionLikeBase::vector_substring_fn(const ColumnString& vals,
367
                                             const ColumnString& search_strings,
368
41
                                             ColumnUInt8::Container& result) {
369
41
    DCHECK(vals.size() == search_strings.size());
370
41
    DCHECK(vals.size() == result.size());
371
41
    auto sz = vals.size();
372
107
    for (size_t i = 0; i < sz; ++i) {
373
66
        const auto& str_sv = vals.get_data_at(i);
374
66
        const auto& search_string_sv = search_strings.get_data_at(i);
375
66
        if (search_string_sv.size == 0) {
376
2
            result[i] = true;
377
2
            continue;
378
2
        }
379
64
        doris::StringSearch substring_search(&search_string_sv);
380
64
        result[i] = substring_search.search(str_sv) != -1;
381
64
    }
382
41
    return Status::OK();
383
41
}
384
385
Status FunctionLikeBase::constant_regex_fn_scalar(const LikeSearchState* state,
386
                                                  const StringRef& val, const StringRef& pattern,
387
1.94k
                                                  unsigned char* result) {
388
1.94k
    if (state->hs_database) { // use hyperscan
389
1.76k
        auto ret = hs_scan(state->hs_database.get(), val.data, (int)val.size, 0,
390
1.76k
                           state->hs_scratch.get(), doris::LikeSearchState::hs_match_handler,
391
1.76k
                           (void*)result);
392
1.76k
        if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
393
0
            return Status::RuntimeError(fmt::format("hyperscan error: {}", ret));
394
0
        }
395
1.76k
    } else if (state->boost_regex) { // use boost::regex for advanced features
396
4
        *result = boost::regex_search(val.data, val.data + val.size, *state->boost_regex);
397
178
    } else { // fallback to re2
398
178
        *result = RE2::PartialMatch(re2::StringPiece(val.data, val.size), *state->regex);
399
178
    }
400
401
1.94k
    return Status::OK();
402
1.94k
}
403
404
Status FunctionLikeBase::regexp_fn_scalar(const LikeSearchState* state, const StringRef& val,
405
104
                                          const StringRef& pattern, unsigned char* result) {
406
104
    RE2::Options opts;
407
104
    opts.set_never_nl(false);
408
104
    opts.set_dot_nl(true);
409
104
    re2::RE2 re(re2::StringPiece(pattern.data, pattern.size), opts);
410
104
    if (re.ok()) {
411
104
        *result = RE2::PartialMatch(re2::StringPiece(val.data, val.size), re);
412
104
    } else {
413
0
        return Status::RuntimeError("Invalid pattern: {}", pattern.debug_string());
414
0
    }
415
416
104
    return Status::OK();
417
104
}
418
419
Status FunctionLikeBase::constant_regex_fn(const LikeSearchState* state, const ColumnString& val,
420
                                           const StringRef& pattern,
421
555
                                           ColumnUInt8::Container& result) {
422
555
    auto sz = val.size();
423
555
    if (state->hs_database) { // use hyperscan
424
892k
        for (size_t i = 0; i < sz; i++) {
425
891k
            const auto& str_ref = val.get_data_at(i);
426
891k
            auto ret = hs_scan(state->hs_database.get(), str_ref.data, (int)str_ref.size, 0,
427
891k
                               state->hs_scratch.get(), doris::LikeSearchState::hs_match_handler,
428
891k
                               (void*)(result.data() + i));
429
891k
            if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
430
0
                return Status::RuntimeError(fmt::format("hyperscan error: {}", ret));
431
0
            }
432
891k
        }
433
552
    } else if (state->boost_regex) { // use boost::regex for advanced features
434
0
        for (size_t i = 0; i < sz; i++) {
435
0
            const auto& str_ref = val.get_data_at(i);
436
0
            *(result.data() + i) = boost::regex_search(str_ref.data, str_ref.data + str_ref.size,
437
0
                                                       *state->boost_regex);
438
0
        }
439
3
    } else { // fallback to re2
440
7
        for (size_t i = 0; i < sz; i++) {
441
4
            const auto& str_ref = val.get_data_at(i);
442
4
            *(result.data() + i) =
443
4
                    RE2::PartialMatch(re2::StringPiece(str_ref.data, str_ref.size), *state->regex);
444
4
        }
445
3
    }
446
447
555
    return Status::OK();
448
555
}
449
450
Status FunctionLikeBase::regexp_fn(const LikeSearchState* state, const ColumnString& val,
451
0
                                   const StringRef& pattern, ColumnUInt8::Container& result) {
452
0
    std::string re_pattern(pattern.data, pattern.size);
453
454
0
    hs_database_t* database = nullptr;
455
0
    hs_scratch_t* scratch = nullptr;
456
0
    if (hs_prepare(nullptr, re_pattern.c_str(), &database, &scratch).ok()) { // use hyperscan
457
0
        auto sz = val.size();
458
0
        for (size_t i = 0; i < sz; i++) {
459
0
            const auto& str_ref = val.get_data_at(i);
460
0
            auto ret =
461
0
                    hs_scan(database, str_ref.data, (int)str_ref.size, 0, scratch,
462
0
                            doris::LikeSearchState::hs_match_handler, (void*)(result.data() + i));
463
0
            if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
464
0
                return Status::RuntimeError(fmt::format("hyperscan error: {}", ret));
465
0
            }
466
0
        }
467
468
0
        hs_free_scratch(scratch);
469
0
        hs_free_database(database);
470
0
    } else { // fallback to re2
471
0
        RE2::Options opts;
472
0
        opts.set_never_nl(false);
473
0
        opts.set_dot_nl(true);
474
0
        re2::RE2 re(re_pattern, opts);
475
0
        if (re.ok()) {
476
0
            auto sz = val.size();
477
0
            for (size_t i = 0; i < sz; i++) {
478
0
                const auto& str_ref = val.get_data_at(i);
479
0
                *(result.data() + i) =
480
0
                        RE2::PartialMatch(re2::StringPiece(str_ref.data, str_ref.size), re);
481
0
            }
482
0
        } else {
483
0
            return Status::RuntimeError("Invalid pattern: {}", pattern.debug_string());
484
0
        }
485
0
    }
486
487
0
    return Status::OK();
488
0
}
489
490
// hyperscan compile expression to database and allocate scratch space
491
Status FunctionLikeBase::hs_prepare(FunctionContext* context, const char* expression,
492
1.47k
                                    hs_database_t** database, hs_scratch_t** scratch) {
493
1.47k
    hs_compile_error_t* compile_err;
494
1.47k
    auto res = hs_compile(expression, HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8,
495
1.47k
                          HS_MODE_BLOCK, nullptr, database, &compile_err);
496
497
1.47k
    if (res != HS_SUCCESS) {
498
19
        *database = nullptr;
499
19
        std::string error_message = compile_err->message;
500
19
        hs_free_compile_error(compile_err);
501
        // Do not call FunctionContext::set_error here, since we do not want to cancel the query here.
502
19
        return Status::RuntimeError<false>("hs_compile regex pattern error:" + error_message);
503
19
    }
504
1.45k
    hs_free_compile_error(compile_err);
505
506
1.45k
    if (hs_alloc_scratch(*database, scratch) != HS_SUCCESS) {
507
0
        hs_free_database(*database);
508
0
        *database = nullptr;
509
0
        *scratch = nullptr;
510
        // Do not call FunctionContext::set_error here, since we do not want to cancel the query here.
511
0
        return Status::RuntimeError<false>("hs_alloc_scratch allocate scratch space error");
512
0
    }
513
514
1.45k
    return Status::OK();
515
1.45k
}
516
517
Status FunctionLikeBase::execute_impl(FunctionContext* context, Block& block,
518
                                      const ColumnNumbers& arguments, uint32_t result,
519
1.73k
                                      size_t input_rows_count) const {
520
1.73k
    const auto values_col =
521
1.73k
            block.get_by_position(arguments[0]).column->convert_to_full_column_if_const();
522
1.73k
    const auto* values = check_and_get_column<ColumnString>(values_col.get());
523
524
1.73k
    if (!values) {
525
0
        return Status::InternalError("Not supported input arguments types");
526
0
    }
527
    // result column
528
1.73k
    auto res = ColumnUInt8::create();
529
1.73k
    ColumnUInt8::Container& vec_res = res->get_data();
530
    // set default value to 0, and match functions only need to set 1/true
531
1.73k
    vec_res.resize_fill(input_rows_count);
532
1.73k
    auto* state = reinterpret_cast<LikeState*>(
533
1.73k
            context->get_function_state(FunctionContext::THREAD_LOCAL));
534
    // for constant_substring_fn, use long run length search for performance
535
1.73k
    if (constant_substring_fn ==
536
1.73k
        *(state->function
537
1.73k
                  .target<doris::Status (*)(const LikeSearchState* state, const ColumnString&,
538
1.73k
                                            const StringRef&, ColumnUInt8::Container&)>())) {
539
231
        RETURN_IF_ERROR(execute_substring(values->get_chars(), values->get_offsets(), vec_res,
540
231
                                          &state->search_state));
541
1.50k
    } else {
542
1.50k
        const auto pattern_col = block.get_by_position(arguments[1]).column;
543
1.50k
        if (const auto* str_patterns = check_and_get_column<ColumnString>(pattern_col.get())) {
544
451
            RETURN_IF_ERROR(
545
451
                    vector_non_const(*values, *str_patterns, vec_res, state, input_rows_count));
546
1.05k
        } else if (const auto* const_patterns =
547
1.05k
                           check_and_get_column<ColumnConst>(pattern_col.get())) {
548
1.05k
            const auto& pattern_val = const_patterns->get_data_at(0);
549
1.05k
            RETURN_IF_ERROR(vector_const(*values, &pattern_val, vec_res, state->function,
550
1.05k
                                         &state->search_state));
551
18.4E
        } else {
552
18.4E
            return Status::InternalError("Not supported input arguments types");
553
18.4E
        }
554
1.50k
    }
555
1.73k
    block.replace_by_position(result, std::move(res));
556
1.73k
    return Status::OK();
557
1.73k
}
558
559
Status FunctionLikeBase::execute_substring(const ColumnString::Chars& values,
560
                                           const ColumnString::Offsets& value_offsets,
561
                                           ColumnUInt8::Container& result,
562
231
                                           LikeSearchState* search_state) const {
563
    // treat continuous multi string data as a long string data
564
231
    const UInt8* begin = values.data();
565
231
    const UInt8* end = begin + values.size();
566
231
    const UInt8* pos = begin;
567
568
    /// Current index in the array of strings.
569
231
    size_t i = 0;
570
231
    size_t needle_size = search_state->substring_pattern.get_pattern_length();
571
572
    /// We will search for the next occurrence in all strings at once.
573
1.49k
    while (pos < end) {
574
        // search return matched substring start offset
575
1.35k
        pos = (UInt8*)search_state->substring_pattern.search((char*)pos, end - pos);
576
1.35k
        if (pos >= end) {
577
95
            break;
578
95
        }
579
580
        /// Determine which index it refers to.
581
        /// begin + value_offsets[i] is the start offset of string at i+1
582
1.29k
        while (i < value_offsets.size() && begin + value_offsets[i] < pos) {
583
29
            ++i;
584
29
        }
585
586
        /// We check that the entry does not pass through the boundaries of strings.
587
1.26k
        if (pos + needle_size <= begin + value_offsets[i]) {
588
1.24k
            result[i] = 1;
589
1.24k
        }
590
591
        // move to next string offset
592
1.26k
        pos = begin + value_offsets[i];
593
1.26k
        ++i;
594
1.26k
    }
595
596
231
    return Status::OK();
597
231
}
598
599
Status FunctionLikeBase::vector_const(const ColumnString& values, const StringRef* pattern_val,
600
                                      ColumnUInt8::Container& result, const LikeFn& function,
601
1.05k
                                      LikeSearchState* search_state) const {
602
1.05k
    RETURN_IF_ERROR((function)(search_state, values,
603
1.05k
                               *reinterpret_cast<const StringRef*>(pattern_val), result));
604
1.05k
    return Status::OK();
605
1.05k
}
606
607
template <bool LIKE_PATTERN>
608
451
VPatternSearchStateSPtr FunctionLikeBase::pattern_type_recognition(const ColumnString& patterns) {
609
451
    VPatternSearchStateSPtr allpass_state = std::make_shared<VectorAllpassSearchState>();
610
451
    VPatternSearchStateSPtr equal_state = std::make_shared<VectorEqualSearchState>();
611
451
    VPatternSearchStateSPtr substring_state = std::make_shared<VectorSubStringSearchState>();
612
451
    VPatternSearchStateSPtr starts_with_state = std::make_shared<VectorStartsWithSearchState>();
613
451
    VPatternSearchStateSPtr ends_with_state = std::make_shared<VectorEndsWithSearchState>();
614
451
    size_t size = patterns.size();
615
616
1.03k
    for (size_t i = 0; i < size; ++i) {
617
592
        if (!allpass_state->_pattern_matched && !equal_state->_pattern_matched &&
618
592
            !substring_state->_pattern_matched && !starts_with_state->_pattern_matched &&
619
592
            !ends_with_state->_pattern_matched) {
620
12
            return nullptr;
621
12
        }
622
580
        std::string pattern_str = patterns.get_data_at(i).to_string();
623
580
        if (allpass_state->_pattern_matched) {
624
456
            if constexpr (LIKE_PATTERN) {
625
424
                allpass_state->like_pattern_match(pattern_str);
626
424
            } else {
627
32
                allpass_state->regexp_pattern_match(pattern_str);
628
32
            }
629
456
        }
630
580
        if (equal_state->_pattern_matched) {
631
482
            if constexpr (LIKE_PATTERN) {
632
450
                equal_state->like_pattern_match(pattern_str);
633
450
            } else {
634
32
                equal_state->regexp_pattern_match(pattern_str);
635
32
            }
636
482
        }
637
580
        if (substring_state->_pattern_matched) {
638
497
            if constexpr (LIKE_PATTERN) {
639
442
                substring_state->like_pattern_match(pattern_str);
640
442
            } else {
641
55
                substring_state->regexp_pattern_match(pattern_str);
642
55
            }
643
497
        }
644
580
        if (starts_with_state->_pattern_matched) {
645
481
            if constexpr (LIKE_PATTERN) {
646
431
                starts_with_state->like_pattern_match(pattern_str);
647
431
            } else {
648
50
                starts_with_state->regexp_pattern_match(pattern_str);
649
50
            }
650
481
        }
651
580
        if (ends_with_state->_pattern_matched) {
652
468
            if constexpr (LIKE_PATTERN) {
653
430
                ends_with_state->like_pattern_match(pattern_str);
654
430
            } else {
655
38
                ends_with_state->regexp_pattern_match(pattern_str);
656
38
            }
657
468
        }
658
580
    }
659
660
439
    if (allpass_state->_pattern_matched) {
661
21
        return allpass_state;
662
418
    } else if (equal_state->_pattern_matched) {
663
88
        return equal_state;
664
330
    } else if (substring_state->_pattern_matched) {
665
41
        return substring_state;
666
289
    } else if (starts_with_state->_pattern_matched) {
667
76
        return starts_with_state;
668
213
    } else if (ends_with_state->_pattern_matched) {
669
34
        return ends_with_state;
670
179
    } else {
671
179
        return nullptr;
672
179
    }
673
439
}
_ZN5doris16FunctionLikeBase24pattern_type_recognitionILb1EEESt10shared_ptrINS_24VectorPatternSearchStateEERKNS_9ColumnStrIjEE
Line
Count
Source
608
419
VPatternSearchStateSPtr FunctionLikeBase::pattern_type_recognition(const ColumnString& patterns) {
609
419
    VPatternSearchStateSPtr allpass_state = std::make_shared<VectorAllpassSearchState>();
610
419
    VPatternSearchStateSPtr equal_state = std::make_shared<VectorEqualSearchState>();
611
419
    VPatternSearchStateSPtr substring_state = std::make_shared<VectorSubStringSearchState>();
612
419
    VPatternSearchStateSPtr starts_with_state = std::make_shared<VectorStartsWithSearchState>();
613
419
    VPatternSearchStateSPtr ends_with_state = std::make_shared<VectorEndsWithSearchState>();
614
419
    size_t size = patterns.size();
615
616
920
    for (size_t i = 0; i < size; ++i) {
617
513
        if (!allpass_state->_pattern_matched && !equal_state->_pattern_matched &&
618
513
            !substring_state->_pattern_matched && !starts_with_state->_pattern_matched &&
619
513
            !ends_with_state->_pattern_matched) {
620
12
            return nullptr;
621
12
        }
622
501
        std::string pattern_str = patterns.get_data_at(i).to_string();
623
501
        if (allpass_state->_pattern_matched) {
624
424
            if constexpr (LIKE_PATTERN) {
625
424
                allpass_state->like_pattern_match(pattern_str);
626
            } else {
627
                allpass_state->regexp_pattern_match(pattern_str);
628
            }
629
424
        }
630
501
        if (equal_state->_pattern_matched) {
631
450
            if constexpr (LIKE_PATTERN) {
632
450
                equal_state->like_pattern_match(pattern_str);
633
            } else {
634
                equal_state->regexp_pattern_match(pattern_str);
635
            }
636
450
        }
637
501
        if (substring_state->_pattern_matched) {
638
442
            if constexpr (LIKE_PATTERN) {
639
442
                substring_state->like_pattern_match(pattern_str);
640
            } else {
641
                substring_state->regexp_pattern_match(pattern_str);
642
            }
643
442
        }
644
501
        if (starts_with_state->_pattern_matched) {
645
431
            if constexpr (LIKE_PATTERN) {
646
431
                starts_with_state->like_pattern_match(pattern_str);
647
            } else {
648
                starts_with_state->regexp_pattern_match(pattern_str);
649
            }
650
431
        }
651
501
        if (ends_with_state->_pattern_matched) {
652
430
            if constexpr (LIKE_PATTERN) {
653
430
                ends_with_state->like_pattern_match(pattern_str);
654
            } else {
655
                ends_with_state->regexp_pattern_match(pattern_str);
656
            }
657
430
        }
658
501
    }
659
660
407
    if (allpass_state->_pattern_matched) {
661
21
        return allpass_state;
662
386
    } else if (equal_state->_pattern_matched) {
663
88
        return equal_state;
664
298
    } else if (substring_state->_pattern_matched) {
665
30
        return substring_state;
666
268
    } else if (starts_with_state->_pattern_matched) {
667
71
        return starts_with_state;
668
197
    } else if (ends_with_state->_pattern_matched) {
669
31
        return ends_with_state;
670
166
    } else {
671
166
        return nullptr;
672
166
    }
673
407
}
_ZN5doris16FunctionLikeBase24pattern_type_recognitionILb0EEESt10shared_ptrINS_24VectorPatternSearchStateEERKNS_9ColumnStrIjEE
Line
Count
Source
608
32
VPatternSearchStateSPtr FunctionLikeBase::pattern_type_recognition(const ColumnString& patterns) {
609
32
    VPatternSearchStateSPtr allpass_state = std::make_shared<VectorAllpassSearchState>();
610
32
    VPatternSearchStateSPtr equal_state = std::make_shared<VectorEqualSearchState>();
611
32
    VPatternSearchStateSPtr substring_state = std::make_shared<VectorSubStringSearchState>();
612
32
    VPatternSearchStateSPtr starts_with_state = std::make_shared<VectorStartsWithSearchState>();
613
32
    VPatternSearchStateSPtr ends_with_state = std::make_shared<VectorEndsWithSearchState>();
614
32
    size_t size = patterns.size();
615
616
111
    for (size_t i = 0; i < size; ++i) {
617
79
        if (!allpass_state->_pattern_matched && !equal_state->_pattern_matched &&
618
79
            !substring_state->_pattern_matched && !starts_with_state->_pattern_matched &&
619
79
            !ends_with_state->_pattern_matched) {
620
0
            return nullptr;
621
0
        }
622
79
        std::string pattern_str = patterns.get_data_at(i).to_string();
623
79
        if (allpass_state->_pattern_matched) {
624
            if constexpr (LIKE_PATTERN) {
625
                allpass_state->like_pattern_match(pattern_str);
626
32
            } else {
627
32
                allpass_state->regexp_pattern_match(pattern_str);
628
32
            }
629
32
        }
630
79
        if (equal_state->_pattern_matched) {
631
            if constexpr (LIKE_PATTERN) {
632
                equal_state->like_pattern_match(pattern_str);
633
32
            } else {
634
32
                equal_state->regexp_pattern_match(pattern_str);
635
32
            }
636
32
        }
637
79
        if (substring_state->_pattern_matched) {
638
            if constexpr (LIKE_PATTERN) {
639
                substring_state->like_pattern_match(pattern_str);
640
55
            } else {
641
55
                substring_state->regexp_pattern_match(pattern_str);
642
55
            }
643
55
        }
644
79
        if (starts_with_state->_pattern_matched) {
645
            if constexpr (LIKE_PATTERN) {
646
                starts_with_state->like_pattern_match(pattern_str);
647
50
            } else {
648
50
                starts_with_state->regexp_pattern_match(pattern_str);
649
50
            }
650
50
        }
651
79
        if (ends_with_state->_pattern_matched) {
652
            if constexpr (LIKE_PATTERN) {
653
                ends_with_state->like_pattern_match(pattern_str);
654
38
            } else {
655
38
                ends_with_state->regexp_pattern_match(pattern_str);
656
38
            }
657
38
        }
658
79
    }
659
660
32
    if (allpass_state->_pattern_matched) {
661
0
        return allpass_state;
662
32
    } else if (equal_state->_pattern_matched) {
663
0
        return equal_state;
664
32
    } else if (substring_state->_pattern_matched) {
665
11
        return substring_state;
666
21
    } else if (starts_with_state->_pattern_matched) {
667
5
        return starts_with_state;
668
16
    } else if (ends_with_state->_pattern_matched) {
669
3
        return ends_with_state;
670
13
    } else {
671
13
        return nullptr;
672
13
    }
673
32
}
674
675
Status FunctionLikeBase::vector_non_const(const ColumnString& values, const ColumnString& patterns,
676
                                          ColumnUInt8::Container& result, LikeState* state,
677
451
                                          size_t input_rows_count) const {
678
451
    ColumnString::MutablePtr replaced_patterns;
679
451
    VPatternSearchStateSPtr vector_search_state;
680
451
    if (state->is_like_pattern) {
681
419
        if (state->has_custom_escape) {
682
5
            replaced_patterns = ColumnString::create();
683
10
            for (int i = 0; i < input_rows_count; ++i) {
684
5
                std::string val =
685
5
                        replace_pattern_by_escape(patterns.get_data_at(i), state->escape_char);
686
5
                replaced_patterns->insert_data(val.c_str(), val.size());
687
5
            }
688
5
            vector_search_state = pattern_type_recognition<true>(*replaced_patterns);
689
414
        } else {
690
414
            vector_search_state = pattern_type_recognition<true>(patterns);
691
414
        }
692
419
    } else {
693
32
        vector_search_state = pattern_type_recognition<false>(patterns);
694
32
    }
695
696
451
    const ColumnString& real_pattern = state->has_custom_escape ? *replaced_patterns : patterns;
697
698
451
    if (vector_search_state == nullptr) {
699
        // pattern type recognition failed, use default case
700
548
        for (int i = 0; i < input_rows_count; ++i) {
701
357
            const auto pattern_val = real_pattern.get_data_at(i);
702
357
            const auto value_val = values.get_data_at(i);
703
357
            RETURN_IF_ERROR((state->scalar_function)(&state->search_state, value_val, pattern_val,
704
357
                                                     &result[i]));
705
357
        }
706
191
        return Status::OK();
707
191
    }
708
260
    const auto* search_strings =
709
260
            static_cast<const ColumnString*>(vector_search_state->_search_strings.get());
710
260
    return (vector_search_state->_vector_function)(values, *search_strings, result);
711
451
}
712
713
Status FunctionLike::like_fn(const LikeSearchState* state, const ColumnString& val,
714
0
                             const StringRef& pattern, ColumnUInt8::Container& result) {
715
0
    std::string re_pattern;
716
0
    convert_like_pattern(state, std::string(pattern.data, pattern.size), &re_pattern);
717
0
    return regexp_fn(state, val, {re_pattern.c_str(), re_pattern.size()}, result);
718
0
}
719
720
Status FunctionLike::like_fn_scalar(const LikeSearchState* state, const StringRef& val,
721
243
                                    const StringRef& pattern, unsigned char* result) {
722
    // Try to use fast path to avoid regex compilation
723
243
    std::string search_string;
724
243
    LikeFastPath fast_path = extract_like_fast_path(pattern.data, pattern.size, search_string);
725
726
243
    switch (fast_path) {
727
12
    case LikeFastPath::ALLPASS:
728
12
        *result = 1;
729
12
        return Status::OK();
730
61
    case LikeFastPath::EQUALS:
731
61
        *result = (val.size == search_string.size() &&
732
61
                   (search_string.empty() ||
733
33
                    memcmp(val.data, search_string.data(), search_string.size()) == 0));
734
61
        return Status::OK();
735
22
    case LikeFastPath::STARTS_WITH:
736
22
        *result = (val.size >= search_string.size() &&
737
22
                   memcmp(val.data, search_string.data(), search_string.size()) == 0);
738
22
        return Status::OK();
739
16
    case LikeFastPath::ENDS_WITH:
740
16
        *result = (val.size >= search_string.size() &&
741
16
                   memcmp(val.data + val.size - search_string.size(), search_string.data(),
742
15
                          search_string.size()) == 0);
743
16
        return Status::OK();
744
34
    case LikeFastPath::SUBSTRING:
745
34
        if (search_string.empty()) {
746
0
            *result = 1;
747
34
        } else {
748
            // Use memmem for substring search
749
34
            *result = (memmem(val.data, val.size, search_string.data(), search_string.size()) !=
750
34
                       nullptr);
751
34
        }
752
34
        return Status::OK();
753
98
    case LikeFastPath::REGEX:
754
98
    default:
755
        // Fall back to regex matching
756
98
        std::string re_pattern;
757
98
        convert_like_pattern(state, std::string(pattern.data, pattern.size), &re_pattern);
758
98
        return regexp_fn_scalar(state, StringRef(val.data, val.size),
759
98
                                {re_pattern.c_str(), re_pattern.size()}, result);
760
243
    }
761
243
}
762
763
void FunctionLike::convert_like_pattern(const LikeSearchState* state, const std::string& pattern,
764
1.71k
                                        std::string* re_pattern) {
765
1.71k
    re_pattern->clear();
766
767
1.71k
    if (pattern.empty()) {
768
0
        re_pattern->append("^$");
769
0
        return;
770
0
    }
771
772
    // add ^ to pattern head to match line head
773
1.71k
    if (!pattern.empty() && pattern[0] != '%') {
774
1.07k
        re_pattern->append("^");
775
1.07k
    }
776
777
    // expect % and _, all chars should keep it literal mean.
778
13.7k
    for (size_t i = 0; i < pattern.size(); i++) {
779
12.0k
        char c = pattern[i];
780
12.0k
        if (c == '\\' && i + 1 < pattern.size()) {
781
549
            char next_c = pattern[i + 1];
782
549
            if (next_c == '%' || next_c == '_') {
783
                // convert "\%" and "\_" to literal "%" and "_"
784
222
                re_pattern->append(1, next_c);
785
222
                i++;
786
222
                continue;
787
327
            } else if (next_c == '\\') {
788
                // keep valid escape "\\"
789
308
                re_pattern->append("\\\\");
790
308
                i++;
791
308
                continue;
792
308
            }
793
549
        }
794
795
11.5k
        if (c == '%') {
796
1.94k
            re_pattern->append(".*");
797
9.59k
        } else if (c == '_') {
798
1.49k
            re_pattern->append(".");
799
8.10k
        } else {
800
            // special for hyperscan: [, ], (, ), {, }, -, *, +, \, |, /, :, ^, ., $, ?
801
8.11k
            if (c == '[' || c == ']' || c == '(' || c == ')' || c == '{' || c == '}' || c == '-' ||
802
8.10k
                c == '*' || c == '+' || c == '\\' || c == '|' || c == '/' || c == ':' || c == '^' ||
803
8.10k
                c == '.' || c == '$' || c == '?') {
804
119
                re_pattern->append(1, '\\');
805
119
            }
806
8.10k
            re_pattern->append(1, c);
807
8.10k
        }
808
11.5k
    }
809
810
    // add $ to pattern tail to match line tail
811
1.71k
    if (!pattern.empty() && re_pattern->back() != '*') {
812
856
        re_pattern->append("$");
813
856
    }
814
1.71k
}
815
816
2.31k
void FunctionLike::remove_escape_character(std::string* search_string) {
817
2.31k
    std::string tmp_search_string;
818
2.31k
    tmp_search_string.swap(*search_string);
819
2.31k
    int64_t len = tmp_search_string.length();
820
    // sometime 'like' may allowed converted to 'equals/start_with/end_with/sub_with'
821
    // so we need to remove escape from pattern to construct search string and use to do 'equals/start_with/end_with/sub_with'
822
11.6k
    for (int i = 0; i < len;) {
823
9.37k
        if (tmp_search_string[i] == '\\' && i + 1 < len &&
824
9.37k
            (tmp_search_string[i + 1] == '%' || tmp_search_string[i + 1] == '_' ||
825
156
             tmp_search_string[i + 1] == '\\')) {
826
147
            search_string->append(1, tmp_search_string[i + 1]);
827
147
            i += 2;
828
9.23k
        } else {
829
9.23k
            search_string->append(1, tmp_search_string[i]);
830
9.23k
            i++;
831
9.23k
        }
832
9.37k
    }
833
2.31k
}
834
835
0
bool re2_full_match(const std::string& str, const RE2& re, std::vector<std::string>& results) {
836
0
    if (!re.ok()) {
837
0
        return false;
838
0
    }
839
840
0
    std::vector<RE2::Arg> arguments;
841
0
    std::vector<RE2::Arg*> arguments_ptrs;
842
0
    std::size_t args_count = re.NumberOfCapturingGroups();
843
0
    arguments.resize(args_count);
844
0
    arguments_ptrs.resize(args_count);
845
0
    results.resize(args_count);
846
0
    for (std::size_t i = 0; i < args_count; ++i) {
847
0
        arguments[i] = &results[i];
848
0
        arguments_ptrs[i] = &arguments[i];
849
0
    }
850
851
0
    return RE2::FullMatchN(str, re, arguments_ptrs.data(), (int)args_count);
852
0
}
853
854
0
void verbose_log_match(const std::string& str, const std::string& pattern_name, const RE2& re) {
855
0
    std::vector<std::string> results;
856
0
    VLOG_DEBUG << "arg str: " << str << ", size: " << str.size() << ", pattern " << pattern_name
857
0
               << ": " << re.pattern() << ", size: " << re.pattern().size();
858
0
    if (re2_full_match(str, re, results)) {
859
0
        for (int i = 0; i < results.size(); ++i) {
860
0
            VLOG_DEBUG << "match " << i << ": " << results[i] << ", size: " << results[i].size();
861
0
        }
862
0
    } else {
863
0
        VLOG_DEBUG << "no match";
864
0
    }
865
0
}
866
867
Status FunctionLike::construct_like_const_state(FunctionContext* context, const StringRef& pattern,
868
                                                std::shared_ptr<LikeState>& state,
869
3.38k
                                                bool try_hyperscan) {
870
3.38k
    std::string pattern_str;
871
3.38k
    if (state->has_custom_escape) {
872
11
        pattern_str = replace_pattern_by_escape(pattern, state->escape_char);
873
3.37k
    } else {
874
3.37k
        pattern_str = pattern.to_string();
875
3.37k
    }
876
3.38k
    state->search_state.pattern_str = pattern_str;
877
3.38k
    std::string search_string;
878
879
3.38k
    if (!pattern_str.empty() && RE2::FullMatch(pattern_str, LIKE_ALLPASS_RE)) {
880
157
        state->search_state.set_search_string("");
881
157
        state->function = constant_allpass_fn;
882
157
        state->scalar_function = constant_allpass_fn_scalar;
883
3.22k
    } else if (pattern_str.empty() || RE2::FullMatch(pattern_str, LIKE_EQUALS_RE, &search_string)) {
884
195
        if (VLOG_DEBUG_IS_ON) {
885
0
            verbose_log_match(pattern_str, "LIKE_EQUALS_RE", LIKE_EQUALS_RE);
886
0
            VLOG_DEBUG << "search_string : " << search_string << ", size: " << search_string.size();
887
0
        }
888
195
        remove_escape_character(&search_string);
889
195
        if (VLOG_DEBUG_IS_ON) {
890
0
            VLOG_DEBUG << "search_string escape removed: " << search_string
891
0
                       << ", size: " << search_string.size();
892
0
        }
893
195
        state->search_state.set_search_string(search_string);
894
195
        state->function = constant_equals_fn;
895
195
        state->scalar_function = constant_equals_fn_scalar;
896
3.03k
    } else if (RE2::FullMatch(pattern_str, LIKE_STARTS_WITH_RE, &search_string)) {
897
729
        if (VLOG_DEBUG_IS_ON) {
898
0
            verbose_log_match(pattern_str, "LIKE_STARTS_WITH_RE", LIKE_STARTS_WITH_RE);
899
0
            VLOG_DEBUG << "search_string : " << search_string << ", size: " << search_string.size();
900
0
        }
901
729
        remove_escape_character(&search_string);
902
729
        if (VLOG_DEBUG_IS_ON) {
903
0
            VLOG_DEBUG << "search_string escape removed: " << search_string
904
0
                       << ", size: " << search_string.size();
905
0
        }
906
729
        state->search_state.set_search_string(search_string);
907
729
        state->function = constant_starts_with_fn;
908
729
        state->scalar_function = constant_starts_with_fn_scalar;
909
2.30k
    } else if (RE2::FullMatch(pattern_str, LIKE_ENDS_WITH_RE, &search_string)) {
910
233
        if (VLOG_DEBUG_IS_ON) {
911
0
            verbose_log_match(pattern_str, "LIKE_ENDS_WITH_RE", LIKE_ENDS_WITH_RE);
912
0
            VLOG_DEBUG << "search_string : " << search_string << ", size: " << search_string.size();
913
0
        }
914
233
        remove_escape_character(&search_string);
915
233
        if (VLOG_DEBUG_IS_ON) {
916
0
            VLOG_DEBUG << "search_string escape removed: " << search_string
917
0
                       << ", size: " << search_string.size();
918
0
        }
919
233
        state->search_state.set_search_string(search_string);
920
233
        state->function = constant_ends_with_fn;
921
233
        state->scalar_function = constant_ends_with_fn_scalar;
922
2.07k
    } else if (RE2::FullMatch(pattern_str, LIKE_SUBSTRING_RE, &search_string)) {
923
872
        if (VLOG_DEBUG_IS_ON) {
924
0
            verbose_log_match(pattern_str, "LIKE_SUBSTRING_RE", LIKE_SUBSTRING_RE);
925
0
            VLOG_DEBUG << "search_string : " << search_string << ", size: " << search_string.size();
926
0
        }
927
872
        remove_escape_character(&search_string);
928
872
        if (VLOG_DEBUG_IS_ON) {
929
0
            VLOG_DEBUG << "search_string escape removed: " << search_string
930
0
                       << ", size: " << search_string.size();
931
0
        }
932
872
        state->search_state.set_search_string(search_string);
933
872
        state->function = constant_substring_fn;
934
872
        state->scalar_function = constant_substring_fn_scalar;
935
1.19k
    } else {
936
1.19k
        std::string re_pattern;
937
1.19k
        convert_like_pattern(&state->search_state, pattern_str, &re_pattern);
938
1.19k
        if (VLOG_DEBUG_IS_ON) {
939
0
            VLOG_DEBUG << "hyperscan, pattern str: " << pattern_str
940
0
                       << ", size: " << pattern_str.size() << ", re pattern: " << re_pattern
941
0
                       << ", size: " << re_pattern.size();
942
0
        }
943
944
1.19k
        hs_database_t* database = nullptr;
945
1.19k
        hs_scratch_t* scratch = nullptr;
946
1.19k
        if (try_hyperscan && hs_prepare(context, re_pattern.c_str(), &database, &scratch).ok()) {
947
            // use hyperscan
948
1.10k
            state->search_state.hs_database.reset(database);
949
1.10k
            state->search_state.hs_scratch.reset(scratch);
950
1.10k
        } else {
951
            // fallback to re2
952
            // reset hs_database to nullptr to indicate not use hyperscan
953
97
            state->search_state.hs_database.reset();
954
97
            state->search_state.hs_scratch.reset();
955
956
97
            RE2::Options opts;
957
97
            opts.set_never_nl(false);
958
97
            opts.set_dot_nl(true);
959
97
            state->search_state.regex = std::make_unique<RE2>(re_pattern, opts);
960
97
            if (!state->search_state.regex->ok()) {
961
0
                return Status::InternalError("Invalid regex expression: {}(origin: {})", re_pattern,
962
0
                                             pattern_str);
963
0
            }
964
97
        }
965
966
1.19k
        state->function = constant_regex_fn;
967
1.19k
        state->scalar_function = constant_regex_fn_scalar;
968
1.19k
    }
969
3.38k
    return Status::OK();
970
3.38k
}
971
972
4.46k
Status FunctionLike::open(FunctionContext* context, FunctionContext::FunctionStateScope scope) {
973
4.46k
    if (scope != FunctionContext::THREAD_LOCAL) {
974
1.00k
        return Status::OK();
975
1.00k
    }
976
3.46k
    std::shared_ptr<LikeState> state = std::make_shared<LikeState>();
977
3.46k
    state->is_like_pattern = true;
978
3.46k
    state->function = like_fn;
979
3.46k
    state->scalar_function = like_fn_scalar;
980
3.46k
    if (context->is_col_constant(2)) {
981
11
        state->has_custom_escape = true;
982
11
        const auto escape_col = context->get_constant_col(2)->column_ptr;
983
11
        const auto& escape = escape_col->get_data_at(0);
984
11
        if (escape.size != 1) {
985
0
            return Status::InternalError("Escape character must be a single character, got: {}",
986
0
                                         escape.to_string());
987
0
        }
988
11
        state->escape_char = escape.data[0];
989
11
    }
990
3.46k
    if (context->is_col_constant(1)) {
991
3.21k
        const auto pattern_col = context->get_constant_col(1)->column_ptr;
992
3.21k
        const auto& pattern = pattern_col->get_data_at(0);
993
3.21k
        RETURN_IF_ERROR(construct_like_const_state(context, pattern, state));
994
3.21k
    }
995
3.46k
    context->set_function_state(scope, state);
996
997
3.46k
    return Status::OK();
998
3.46k
}
999
1000
Status FunctionRegexpLike::open(FunctionContext* context,
1001
652
                                FunctionContext::FunctionStateScope scope) {
1002
652
    if (scope != FunctionContext::THREAD_LOCAL) {
1003
138
        return Status::OK();
1004
138
    }
1005
514
    std::shared_ptr<LikeState> state = std::make_shared<LikeState>();
1006
514
    context->set_function_state(scope, state);
1007
514
    state->is_like_pattern = false;
1008
514
    state->function = regexp_fn;
1009
514
    state->scalar_function = regexp_fn_scalar;
1010
514
    if (context->is_col_constant(1)) {
1011
464
        const auto pattern_col = context->get_constant_col(1)->column_ptr;
1012
464
        const auto& pattern = pattern_col->get_data_at(0);
1013
1014
464
        std::string pattern_str = pattern.to_string();
1015
464
        std::string search_string;
1016
464
        if (RE2::FullMatch(pattern_str, ALLPASS_RE)) {
1017
16
            state->search_state.set_search_string("");
1018
16
            state->function = constant_allpass_fn;
1019
16
            state->scalar_function = constant_allpass_fn_scalar;
1020
448
        } else if (RE2::FullMatch(pattern_str, EQUALS_RE, &search_string)) {
1021
7
            state->search_state.set_search_string(search_string);
1022
7
            state->function = constant_equals_fn;
1023
7
            state->scalar_function = constant_equals_fn_scalar;
1024
441
        } else if (RE2::FullMatch(pattern_str, STARTS_WITH_RE, &search_string)) {
1025
90
            state->search_state.set_search_string(search_string);
1026
90
            state->function = constant_starts_with_fn;
1027
90
            state->scalar_function = constant_starts_with_fn_scalar;
1028
351
        } else if (RE2::FullMatch(pattern_str, ENDS_WITH_RE, &search_string)) {
1029
75
            state->search_state.set_search_string(search_string);
1030
75
            state->function = constant_ends_with_fn;
1031
75
            state->scalar_function = constant_ends_with_fn_scalar;
1032
276
        } else if (RE2::FullMatch(pattern_str, SUBSTRING_RE, &search_string)) {
1033
62
            state->search_state.set_search_string(search_string);
1034
62
            state->function = constant_substring_fn;
1035
62
            state->scalar_function = constant_substring_fn_scalar;
1036
214
        } else {
1037
214
            hs_database_t* database = nullptr;
1038
214
            hs_scratch_t* scratch = nullptr;
1039
214
            if (hs_prepare(context, pattern_str.c_str(), &database, &scratch).ok()) {
1040
                // use hyperscan
1041
196
                state->search_state.hs_database.reset(database);
1042
196
                state->search_state.hs_scratch.reset(scratch);
1043
196
            } else {
1044
                // fallback to re2
1045
                // reset hs_database to nullptr to indicate not use hyperscan
1046
18
                state->search_state.hs_database.reset();
1047
18
                state->search_state.hs_scratch.reset();
1048
18
                RE2::Options opts;
1049
18
                opts.set_never_nl(false);
1050
18
                opts.set_dot_nl(true);
1051
18
                state->search_state.regex = std::make_unique<RE2>(pattern_str, opts);
1052
18
                if (!state->search_state.regex->ok()) {
1053
9
                    if (!context->state()->enable_extended_regex()) {
1054
1
                        return Status::InternalError(
1055
1
                                "Invalid regex expression: {}. Error: {}. If you need advanced "
1056
1
                                "regex features, try setting enable_extended_regex=true",
1057
1
                                pattern_str, state->search_state.regex->error());
1058
1
                    }
1059
1060
                    // RE2 failed, fallback to Boost.Regex
1061
                    // This handles advanced regex features like zero-width assertions
1062
8
                    state->search_state.regex.reset();
1063
8
                    try {
1064
8
                        state->search_state.boost_regex =
1065
8
                                std::make_unique<boost::regex>(pattern_str);
1066
8
                    } catch (const boost::regex_error& e) {
1067
0
                        return Status::InternalError("Invalid regex expression: {}. Error: {}",
1068
0
                                                     pattern_str, e.what());
1069
0
                    }
1070
8
                }
1071
18
            }
1072
213
            state->function = constant_regex_fn;
1073
213
            state->scalar_function = constant_regex_fn_scalar;
1074
213
        }
1075
464
    }
1076
513
    return Status::OK();
1077
514
}
1078
1079
8
void register_function_like(SimpleFunctionFactory& factory) {
1080
8
    factory.register_function<FunctionLike>();
1081
8
}
1082
1083
8
void register_function_regexp(SimpleFunctionFactory& factory) {
1084
8
    factory.register_function<FunctionRegexpLike>();
1085
8
    factory.register_alias(FunctionRegexpLike::name, FunctionRegexpLike::alias);
1086
8
}
1087
#include "common/compile_check_end.h"
1088
} // namespace doris