Coverage Report

Created: 2026-04-22 09:47

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exprs/function/like.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "exprs/function/like.h"
19
20
#include <fmt/format.h>
21
#include <hs/hs_compile.h>
22
#include <re2/stringpiece.h>
23
24
#include <cstddef>
25
#include <ostream>
26
#include <utility>
27
#include <vector>
28
29
#include "common/logging.h"
30
#include "core/block/block.h"
31
#include "core/block/column_with_type_and_name.h"
32
#include "core/column/column.h"
33
#include "core/column/column_const.h"
34
#include "core/column/column_vector.h"
35
#include "core/string_ref.h"
36
#include "exprs/function/simple_function_factory.h"
37
38
namespace doris {
39
// A regex to match any regex pattern which is equivalent to a substring search.
40
static const RE2 SUBSTRING_RE(R"((?:\.\*)*([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)(?:\.\*)*)");
41
42
// A regex to match any regex pattern which is equivalent to matching a constant string
43
// at the end of the string values.
44
static const RE2 ENDS_WITH_RE(R"((?:\.\*)*([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)\$)");
45
46
// A regex to match any regex pattern which is equivalent to matching a constant string
47
// at the start of the string values.
48
static const RE2 STARTS_WITH_RE(R"(\^([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)(?:\.\*)*)");
49
50
// A regex to match any regex pattern which is equivalent to a constant string match.
51
static const RE2 EQUALS_RE(R"(\^([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)\$)");
52
// A regex to match .*
53
static const RE2 ALLPASS_RE(R"((\.\*)+)");
54
55
// Like patterns
56
static const re2::RE2 LIKE_SUBSTRING_RE(R"((?:%+)(((\\_)|([^%_\\]))+)(?:%+))");
57
static const re2::RE2 LIKE_ENDS_WITH_RE("(?:%+)(((\\\\_)|([^%_]))+)");
58
static const re2::RE2 LIKE_STARTS_WITH_RE(R"((((\\%)|(\\_)|([^%_\\]))+)(?:%+))");
59
static const re2::RE2 LIKE_EQUALS_RE("(((\\\\_)|([^%_]))+)");
60
static const re2::RE2 LIKE_ALLPASS_RE("%+");
61
62
struct VectorAllpassSearchState : public VectorPatternSearchState {
63
436
    VectorAllpassSearchState() : VectorPatternSearchState(FunctionLikeBase::vector_allpass_fn) {}
64
65
    ~VectorAllpassSearchState() override = default;
66
67
409
    void like_pattern_match(const std::string& pattern_str) override {
68
409
        if (!pattern_str.empty() && RE2::FullMatch(pattern_str, LIKE_ALLPASS_RE)) {
69
26
            _search_strings->insert_default();
70
383
        } else {
71
383
            _pattern_matched = false;
72
383
        }
73
409
    }
74
75
32
    void regexp_pattern_match(const std::string& pattern_str) override {
76
32
        if (RE2::FullMatch(pattern_str, ALLPASS_RE)) {
77
0
            _search_strings->insert_default();
78
32
        } else {
79
32
            _pattern_matched = false;
80
32
        }
81
32
    }
82
};
83
84
struct VectorEqualSearchState : public VectorPatternSearchState {
85
436
    VectorEqualSearchState() : VectorPatternSearchState(FunctionLikeBase::vector_equals_fn) {}
86
87
    ~VectorEqualSearchState() override = default;
88
89
435
    void like_pattern_match(const std::string& pattern_str) override {
90
435
        _search_string.clear();
91
435
        if (pattern_str.empty() || RE2::FullMatch(pattern_str, LIKE_EQUALS_RE, &_search_string)) {
92
123
            FunctionLike::remove_escape_character(&_search_string);
93
123
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
94
312
        } else {
95
312
            _pattern_matched = false;
96
312
        }
97
435
    }
98
99
32
    void regexp_pattern_match(const std::string& pattern_str) override {
100
32
        _search_string.clear();
101
32
        if (RE2::FullMatch(pattern_str, EQUALS_RE, &_search_string)) {
102
0
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
103
32
        } else {
104
32
            _pattern_matched = false;
105
32
        }
106
32
    }
107
};
108
109
struct VectorSubStringSearchState : public VectorPatternSearchState {
110
    VectorSubStringSearchState()
111
436
            : VectorPatternSearchState(FunctionLikeBase::vector_substring_fn) {}
112
113
    ~VectorSubStringSearchState() override = default;
114
115
426
    void like_pattern_match(const std::string& pattern_str) override {
116
426
        _search_string.clear();
117
426
        if (RE2::FullMatch(pattern_str, LIKE_SUBSTRING_RE, &_search_string)) {
118
53
            FunctionLike::remove_escape_character(&_search_string);
119
53
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
120
373
        } else {
121
373
            _pattern_matched = false;
122
373
        }
123
426
    }
124
125
55
    void regexp_pattern_match(const std::string& pattern_str) override {
126
55
        _search_string.clear();
127
55
        if (RE2::FullMatch(pattern_str, SUBSTRING_RE, &_search_string)) {
128
34
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
129
34
        } else {
130
21
            _pattern_matched = false;
131
21
        }
132
55
    }
133
};
134
135
struct VectorStartsWithSearchState : public VectorPatternSearchState {
136
    VectorStartsWithSearchState()
137
436
            : VectorPatternSearchState(FunctionLikeBase::vector_starts_with_fn) {}
138
139
    ~VectorStartsWithSearchState() override = default;
140
141
416
    void like_pattern_match(const std::string& pattern_str) override {
142
416
        _search_string.clear();
143
416
        if (RE2::FullMatch(pattern_str, LIKE_STARTS_WITH_RE, &_search_string)) {
144
75
            FunctionLike::remove_escape_character(&_search_string);
145
75
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
146
341
        } else {
147
341
            _pattern_matched = false;
148
341
        }
149
416
    }
150
151
50
    void regexp_pattern_match(const std::string& pattern_str) override {
152
50
        _search_string.clear();
153
50
        if (RE2::FullMatch(pattern_str, STARTS_WITH_RE, &_search_string)) {
154
23
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
155
27
        } else {
156
27
            _pattern_matched = false;
157
27
        }
158
50
    }
159
};
160
161
struct VectorEndsWithSearchState : public VectorPatternSearchState {
162
436
    VectorEndsWithSearchState() : VectorPatternSearchState(FunctionLikeBase::vector_ends_with_fn) {}
163
164
    ~VectorEndsWithSearchState() override = default;
165
166
415
    void like_pattern_match(const std::string& pattern_str) override {
167
415
        _search_string.clear();
168
415
        if (RE2::FullMatch(pattern_str, LIKE_ENDS_WITH_RE, &_search_string)) {
169
42
            FunctionLike::remove_escape_character(&_search_string);
170
42
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
171
373
        } else {
172
373
            _pattern_matched = false;
173
373
        }
174
415
    }
175
176
38
    void regexp_pattern_match(const std::string& pattern_str) override {
177
38
        _search_string.clear();
178
38
        if (RE2::FullMatch(pattern_str, ENDS_WITH_RE, &_search_string)) {
179
9
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
180
29
        } else {
181
29
            _pattern_matched = false;
182
29
        }
183
38
    }
184
};
185
186
205
Status LikeSearchState::clone(LikeSearchState& cloned) {
187
205
    cloned.set_search_string(search_string);
188
189
205
    std::string re_pattern;
190
205
    FunctionLike::convert_like_pattern(this, pattern_str, &re_pattern);
191
205
    if (hs_database) { // use hyperscan
192
100
        hs_database_t* database = nullptr;
193
100
        hs_scratch_t* scratch = nullptr;
194
100
        RETURN_IF_ERROR(FunctionLike::hs_prepare(nullptr, re_pattern.c_str(), &database, &scratch));
195
196
100
        cloned.hs_database.reset(database);
197
100
        cloned.hs_scratch.reset(scratch);
198
105
    } else { // fallback to re2
199
105
        cloned.hs_database.reset();
200
105
        cloned.hs_scratch.reset();
201
202
105
        RE2::Options opts;
203
105
        opts.set_never_nl(false);
204
105
        opts.set_dot_nl(true);
205
105
        cloned.regex = std::make_unique<RE2>(re_pattern, opts);
206
105
        if (!cloned.regex->ok()) {
207
0
            return Status::InternalError("Invalid regex expression: {}", re_pattern);
208
0
        }
209
105
    }
210
211
205
    return Status::OK();
212
205
}
213
214
Status FunctionLikeBase::constant_allpass_fn(const LikeSearchState* state, const ColumnString& vals,
215
                                             const StringRef& pattern,
216
36
                                             ColumnUInt8::Container& result) {
217
36
    memset(result.data(), 1, vals.size());
218
36
    return Status::OK();
219
36
}
220
221
Status FunctionLikeBase::constant_allpass_fn_scalar(const LikeSearchState* state,
222
                                                    const StringRef& val, const StringRef& pattern,
223
40
                                                    unsigned char* result) {
224
40
    *result = 1;
225
40
    return Status::OK();
226
40
}
227
228
Status FunctionLikeBase::vector_allpass_fn(const ColumnString& vals,
229
                                           const ColumnString& search_strings,
230
21
                                           ColumnUInt8::Container& result) {
231
21
    DCHECK(vals.size() == search_strings.size());
232
21
    DCHECK(vals.size() == result.size());
233
21
    memset(result.data(), 1, vals.size());
234
21
    return Status::OK();
235
21
}
236
237
Status FunctionLikeBase::constant_starts_with_fn(const LikeSearchState* state,
238
                                                 const ColumnString& val, const StringRef& pattern,
239
267
                                                 ColumnUInt8::Container& result) {
240
267
    auto sz = val.size();
241
126k
    for (size_t i = 0; i < sz; i++) {
242
126k
        const auto& str_ref = val.get_data_at(i);
243
126k
        result[i] = (str_ref.size >= state->search_string_sv.size) &&
244
127k
                    str_ref.start_with(state->search_string_sv);
245
126k
    }
246
267
    return Status::OK();
247
267
}
248
249
Status FunctionLikeBase::constant_starts_with_fn_scalar(const LikeSearchState* state,
250
                                                        const StringRef& val,
251
                                                        const StringRef& pattern,
252
20.0k
                                                        unsigned char* result) {
253
20.0k
    *result = (val.size >= state->search_string_sv.size) &&
254
20.0k
              (state->search_string_sv == val.substring(0, state->search_string_sv.size));
255
20.0k
    return Status::OK();
256
20.0k
}
257
258
Status FunctionLikeBase::vector_starts_with_fn(const ColumnString& vals,
259
                                               const ColumnString& search_strings,
260
68
                                               ColumnUInt8::Container& result) {
261
68
    DCHECK(vals.size() == search_strings.size());
262
68
    DCHECK(vals.size() == result.size());
263
68
    auto sz = vals.size();
264
158
    for (size_t i = 0; i < sz; ++i) {
265
90
        const auto& str_sv = vals.get_data_at(i);
266
90
        const auto& search_string_sv = search_strings.get_data_at(i);
267
90
        result[i] = (str_sv.size >= search_string_sv.size) && str_sv.start_with(search_string_sv);
268
90
    }
269
68
    return Status::OK();
270
68
}
271
272
Status FunctionLikeBase::constant_ends_with_fn(const LikeSearchState* state,
273
                                               const ColumnString& val, const StringRef& pattern,
274
172
                                               ColumnUInt8::Container& result) {
275
172
    auto sz = val.size();
276
22.0k
    for (size_t i = 0; i < sz; i++) {
277
21.8k
        const auto& str_ref = val.get_data_at(i);
278
21.8k
        result[i] = (str_ref.size >= state->search_string_sv.size) &&
279
21.8k
                    str_ref.end_with(state->search_string_sv);
280
21.8k
    }
281
172
    return Status::OK();
282
172
}
283
284
Status FunctionLikeBase::constant_ends_with_fn_scalar(const LikeSearchState* state,
285
                                                      const StringRef& val,
286
                                                      const StringRef& pattern,
287
39
                                                      unsigned char* result) {
288
39
    *result = (val.size >= state->search_string_sv.size) &&
289
39
              (state->search_string_sv == val.substring(val.size - state->search_string_sv.size,
290
34
                                                        state->search_string_sv.size));
291
39
    return Status::OK();
292
39
}
293
294
Status FunctionLikeBase::vector_ends_with_fn(const ColumnString& vals,
295
                                             const ColumnString& search_strings,
296
34
                                             ColumnUInt8::Container& result) {
297
34
    DCHECK(vals.size() == search_strings.size());
298
34
    DCHECK(vals.size() == result.size());
299
34
    auto sz = vals.size();
300
78
    for (size_t i = 0; i < sz; ++i) {
301
44
        const auto& str_sv = vals.get_data_at(i);
302
44
        const auto& search_string_sv = search_strings.get_data_at(i);
303
44
        result[i] = (str_sv.size >= search_string_sv.size) && str_sv.end_with(search_string_sv);
304
44
    }
305
34
    return Status::OK();
306
34
}
307
308
Status FunctionLikeBase::constant_equals_fn(const LikeSearchState* state, const ColumnString& val,
309
                                            const StringRef& pattern,
310
50
                                            ColumnUInt8::Container& result) {
311
50
    auto sz = val.size();
312
127
    for (size_t i = 0; i < sz; i++) {
313
77
        result[i] = (val.get_data_at(i) == state->search_string_sv);
314
77
    }
315
50
    return Status::OK();
316
50
}
317
318
Status FunctionLikeBase::constant_equals_fn_scalar(const LikeSearchState* state,
319
                                                   const StringRef& val, const StringRef& pattern,
320
150
                                                   unsigned char* result) {
321
150
    *result = (val == state->search_string_sv);
322
150
    return Status::OK();
323
150
}
324
325
Status FunctionLikeBase::vector_equals_fn(const ColumnString& vals,
326
                                          const ColumnString& search_strings,
327
92
                                          ColumnUInt8::Container& result) {
328
92
    DCHECK(vals.size() == search_strings.size());
329
92
    DCHECK(vals.size() == result.size());
330
92
    auto sz = vals.size();
331
205
    for (size_t i = 0; i < sz; ++i) {
332
113
        const auto& str_sv = vals.get_data_at(i);
333
113
        const auto& search_string_sv = search_strings.get_data_at(i);
334
113
        result[i] = str_sv == search_string_sv;
335
113
    }
336
92
    return Status::OK();
337
92
}
338
339
Status FunctionLikeBase::constant_substring_fn(const LikeSearchState* state,
340
                                               const ColumnString& val, const StringRef& pattern,
341
0
                                               ColumnUInt8::Container& result) {
342
0
    auto sz = val.size();
343
0
    for (size_t i = 0; i < sz; i++) {
344
0
        if (state->search_string_sv.size == 0) {
345
0
            result[i] = true;
346
0
            continue;
347
0
        }
348
0
        result[i] = state->substring_pattern.search(val.get_data_at(i)) != -1;
349
0
    }
350
0
    return Status::OK();
351
0
}
352
353
Status FunctionLikeBase::constant_substring_fn_scalar(const LikeSearchState* state,
354
                                                      const StringRef& val,
355
                                                      const StringRef& pattern,
356
213
                                                      unsigned char* result) {
357
213
    if (state->search_string_sv.size == 0) {
358
0
        *result = true;
359
0
        return Status::OK();
360
0
    }
361
213
    *result = state->substring_pattern.search(val) != -1;
362
213
    return Status::OK();
363
213
}
364
365
Status FunctionLikeBase::vector_substring_fn(const ColumnString& vals,
366
                                             const ColumnString& search_strings,
367
42
                                             ColumnUInt8::Container& result) {
368
42
    DCHECK(vals.size() == search_strings.size());
369
42
    DCHECK(vals.size() == result.size());
370
42
    auto sz = vals.size();
371
108
    for (size_t i = 0; i < sz; ++i) {
372
66
        const auto& str_sv = vals.get_data_at(i);
373
66
        const auto& search_string_sv = search_strings.get_data_at(i);
374
66
        if (search_string_sv.size == 0) {
375
2
            result[i] = true;
376
2
            continue;
377
2
        }
378
64
        doris::StringSearch substring_search(&search_string_sv);
379
64
        result[i] = substring_search.search(str_sv) != -1;
380
64
    }
381
42
    return Status::OK();
382
42
}
383
384
Status FunctionLikeBase::constant_regex_fn_scalar(const LikeSearchState* state,
385
                                                  const StringRef& val, const StringRef& pattern,
386
1.93k
                                                  unsigned char* result) {
387
1.93k
    if (state->hs_database) { // use hyperscan
388
1.75k
        auto ret = hs_scan(state->hs_database.get(), val.data, (int)val.size, 0,
389
1.75k
                           state->hs_scratch.get(), doris::LikeSearchState::hs_match_handler,
390
1.75k
                           (void*)result);
391
1.75k
        if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
392
0
            return Status::RuntimeError(fmt::format("hyperscan error: {}", ret));
393
0
        }
394
1.75k
    } else if (state->boost_regex) { // use boost::regex for advanced features
395
4
        *result = boost::regex_search(val.data, val.data + val.size, *state->boost_regex);
396
178
    } else { // fallback to re2
397
178
        *result = RE2::PartialMatch(re2::StringPiece(val.data, val.size), *state->regex);
398
178
    }
399
400
1.93k
    return Status::OK();
401
1.93k
}
402
403
Status FunctionLikeBase::regexp_fn_scalar(const LikeSearchState* state, const StringRef& val,
404
104
                                          const StringRef& pattern, unsigned char* result) {
405
104
    RE2::Options opts;
406
104
    opts.set_never_nl(false);
407
104
    opts.set_dot_nl(true);
408
104
    re2::RE2 re(re2::StringPiece(pattern.data, pattern.size), opts);
409
104
    if (re.ok()) {
410
104
        *result = RE2::PartialMatch(re2::StringPiece(val.data, val.size), re);
411
104
    } else {
412
0
        return Status::RuntimeError("Invalid pattern: {}", pattern.debug_string());
413
0
    }
414
415
104
    return Status::OK();
416
104
}
417
418
Status FunctionLikeBase::constant_regex_fn(const LikeSearchState* state, const ColumnString& val,
419
                                           const StringRef& pattern,
420
443
                                           ColumnUInt8::Container& result) {
421
443
    auto sz = val.size();
422
443
    if (state->hs_database) { // use hyperscan
423
891k
        for (size_t i = 0; i < sz; i++) {
424
890k
            const auto& str_ref = val.get_data_at(i);
425
890k
            auto ret = hs_scan(state->hs_database.get(), str_ref.data, (int)str_ref.size, 0,
426
890k
                               state->hs_scratch.get(), doris::LikeSearchState::hs_match_handler,
427
890k
                               (void*)(result.data() + i));
428
890k
            if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
429
0
                return Status::RuntimeError(fmt::format("hyperscan error: {}", ret));
430
0
            }
431
890k
        }
432
440
    } else if (state->boost_regex) { // use boost::regex for advanced features
433
0
        for (size_t i = 0; i < sz; i++) {
434
0
            const auto& str_ref = val.get_data_at(i);
435
0
            *(result.data() + i) = boost::regex_search(str_ref.data, str_ref.data + str_ref.size,
436
0
                                                       *state->boost_regex);
437
0
        }
438
3
    } else { // fallback to re2
439
7
        for (size_t i = 0; i < sz; i++) {
440
4
            const auto& str_ref = val.get_data_at(i);
441
4
            *(result.data() + i) =
442
4
                    RE2::PartialMatch(re2::StringPiece(str_ref.data, str_ref.size), *state->regex);
443
4
        }
444
3
    }
445
446
443
    return Status::OK();
447
443
}
448
449
Status FunctionLikeBase::regexp_fn(const LikeSearchState* state, const ColumnString& val,
450
0
                                   const StringRef& pattern, ColumnUInt8::Container& result) {
451
0
    std::string re_pattern(pattern.data, pattern.size);
452
453
0
    hs_database_t* database = nullptr;
454
0
    hs_scratch_t* scratch = nullptr;
455
0
    if (hs_prepare(nullptr, re_pattern.c_str(), &database, &scratch).ok()) { // use hyperscan
456
0
        auto sz = val.size();
457
0
        for (size_t i = 0; i < sz; i++) {
458
0
            const auto& str_ref = val.get_data_at(i);
459
0
            auto ret =
460
0
                    hs_scan(database, str_ref.data, (int)str_ref.size, 0, scratch,
461
0
                            doris::LikeSearchState::hs_match_handler, (void*)(result.data() + i));
462
0
            if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
463
0
                return Status::RuntimeError(fmt::format("hyperscan error: {}", ret));
464
0
            }
465
0
        }
466
467
0
        hs_free_scratch(scratch);
468
0
        hs_free_database(database);
469
0
    } else { // fallback to re2
470
0
        RE2::Options opts;
471
0
        opts.set_never_nl(false);
472
0
        opts.set_dot_nl(true);
473
0
        re2::RE2 re(re_pattern, opts);
474
0
        if (re.ok()) {
475
0
            auto sz = val.size();
476
0
            for (size_t i = 0; i < sz; i++) {
477
0
                const auto& str_ref = val.get_data_at(i);
478
0
                *(result.data() + i) =
479
0
                        RE2::PartialMatch(re2::StringPiece(str_ref.data, str_ref.size), re);
480
0
            }
481
0
        } else {
482
0
            return Status::RuntimeError("Invalid pattern: {}", pattern.debug_string());
483
0
        }
484
0
    }
485
486
0
    return Status::OK();
487
0
}
488
489
// Compile the expression into a hyperscan database and allocate its scratch space.
490
Status FunctionLikeBase::hs_prepare(FunctionContext* context, const char* expression,
491
1.34k
                                    hs_database_t** database, hs_scratch_t** scratch) {
492
1.34k
    hs_compile_error_t* compile_err;
493
1.34k
    auto res = hs_compile(expression, HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8,
494
1.34k
                          HS_MODE_BLOCK, nullptr, database, &compile_err);
495
496
1.34k
    if (res != HS_SUCCESS) {
497
19
        *database = nullptr;
498
19
        std::string error_message = compile_err->message;
499
19
        hs_free_compile_error(compile_err);
500
        // Do not call FunctionContext::set_error here, since we do not want to cancel the query here.
501
19
        return Status::RuntimeError<false>("hs_compile regex pattern error:" + error_message);
502
19
    }
503
1.33k
    hs_free_compile_error(compile_err);
504
505
1.33k
    if (hs_alloc_scratch(*database, scratch) != HS_SUCCESS) {
506
0
        hs_free_database(*database);
507
0
        *database = nullptr;
508
0
        *scratch = nullptr;
509
        // Do not call FunctionContext::set_error here, since we do not want to cancel the query here.
510
0
        return Status::RuntimeError<false>("hs_alloc_scratch allocate scratch space error");
511
0
    }
512
513
1.33k
    return Status::OK();
514
1.33k
}
515
516
Status FunctionLikeBase::execute_impl(FunctionContext* context, Block& block,
517
                                      const ColumnNumbers& arguments, uint32_t result,
518
1.66k
                                      size_t input_rows_count) const {
519
1.66k
    const auto values_col =
520
1.66k
            block.get_by_position(arguments[0]).column->convert_to_full_column_if_const();
521
1.66k
    const auto* values = check_and_get_column<ColumnString>(values_col.get());
522
523
1.66k
    if (!values) {
524
0
        return Status::InternalError("Not supported input arguments types");
525
0
    }
526
    // result column
527
1.66k
    auto res = ColumnUInt8::create();
528
1.66k
    ColumnUInt8::Container& vec_res = res->get_data();
529
    // set default value to 0, and match functions only need to set 1/true
530
1.66k
    vec_res.resize_fill(input_rows_count);
531
1.66k
    auto* state = reinterpret_cast<LikeState*>(
532
1.66k
            context->get_function_state(FunctionContext::THREAD_LOCAL));
533
    // for constant_substring_fn, use long run length search for performance
534
1.66k
    if (constant_substring_fn ==
535
1.66k
        *(state->function
536
1.66k
                  .target<doris::Status (*)(const LikeSearchState* state, const ColumnString&,
537
1.66k
                                            const StringRef&, ColumnUInt8::Container&)>())) {
538
261
        RETURN_IF_ERROR(execute_substring(values->get_chars(), values->get_offsets(), vec_res,
539
261
                                          &state->search_state));
540
1.40k
    } else {
541
1.40k
        const auto pattern_col = block.get_by_position(arguments[1]).column;
542
1.40k
        if (const auto* str_patterns = check_and_get_column<ColumnString>(pattern_col.get())) {
543
436
            RETURN_IF_ERROR(
544
436
                    vector_non_const(*values, *str_patterns, vec_res, state, input_rows_count));
545
968
        } else if (const auto* const_patterns =
546
968
                           check_and_get_column<ColumnConst>(pattern_col.get())) {
547
968
            const auto& pattern_val = const_patterns->get_data_at(0);
548
968
            RETURN_IF_ERROR(vector_const(*values, &pattern_val, vec_res, state->function,
549
968
                                         &state->search_state));
550
968
        } else {
551
0
            return Status::InternalError("Not supported input arguments types");
552
0
        }
553
1.40k
    }
554
1.66k
    block.replace_by_position(result, std::move(res));
555
1.66k
    return Status::OK();
556
1.66k
}
557
558
Status FunctionLikeBase::execute_substring(const ColumnString::Chars& values,
559
                                           const ColumnString::Offsets& value_offsets,
560
                                           ColumnUInt8::Container& result,
561
261
                                           LikeSearchState* search_state) const {
562
    // treat the contiguous data of multiple strings as one long string
563
261
    const UInt8* begin = values.data();
564
261
    const UInt8* end = begin + values.size();
565
261
    const UInt8* pos = begin;
566
567
    /// Current index in the array of strings.
568
261
    size_t i = 0;
569
261
    size_t needle_size = search_state->substring_pattern.get_pattern_length();
570
571
    /// We will search for the next occurrence in all strings at once.
572
6.90k
    while (pos < end) {
573
        // search returns the start offset of the matched substring
574
6.77k
        pos = (UInt8*)search_state->substring_pattern.search((char*)pos, end - pos);
575
6.77k
        if (pos >= end) {
576
133
            break;
577
133
        }
578
579
        /// Determine which index it refers to.
580
        /// begin + value_offsets[i] is the start offset of string at i+1
581
97.8k
        while (i < value_offsets.size() && begin + value_offsets[i] < pos) {
582
91.2k
            ++i;
583
91.2k
        }
584
585
        /// We check that the entry does not pass through the boundaries of strings.
586
6.64k
        if (pos + needle_size <= begin + value_offsets[i]) {
587
5.62k
            result[i] = 1;
588
5.62k
        }
589
590
        // move to next string offset
591
6.64k
        pos = begin + value_offsets[i];
592
6.64k
        ++i;
593
6.64k
    }
594
595
261
    return Status::OK();
596
261
}
597
598
Status FunctionLikeBase::vector_const(const ColumnString& values, const StringRef* pattern_val,
599
                                      ColumnUInt8::Container& result, const LikeFn& function,
600
968
                                      LikeSearchState* search_state) const {
601
968
    RETURN_IF_ERROR((function)(search_state, values,
602
968
                               *reinterpret_cast<const StringRef*>(pattern_val), result));
603
968
    return Status::OK();
604
968
}
605
606
template <bool LIKE_PATTERN>
607
436
VPatternSearchStateSPtr FunctionLikeBase::pattern_type_recognition(const ColumnString& patterns) {
608
436
    VPatternSearchStateSPtr allpass_state = std::make_shared<VectorAllpassSearchState>();
609
436
    VPatternSearchStateSPtr equal_state = std::make_shared<VectorEqualSearchState>();
610
436
    VPatternSearchStateSPtr substring_state = std::make_shared<VectorSubStringSearchState>();
611
436
    VPatternSearchStateSPtr starts_with_state = std::make_shared<VectorStartsWithSearchState>();
612
436
    VPatternSearchStateSPtr ends_with_state = std::make_shared<VectorEndsWithSearchState>();
613
436
    size_t size = patterns.size();
614
615
1.00k
    for (size_t i = 0; i < size; ++i) {
616
576
        if (!allpass_state->_pattern_matched && !equal_state->_pattern_matched &&
617
576
            !substring_state->_pattern_matched && !starts_with_state->_pattern_matched &&
618
576
            !ends_with_state->_pattern_matched) {
619
12
            return nullptr;
620
12
        }
621
564
        std::string pattern_str = patterns.get_data_at(i).to_string();
622
564
        if (allpass_state->_pattern_matched) {
623
441
            if constexpr (LIKE_PATTERN) {
624
409
                allpass_state->like_pattern_match(pattern_str);
625
409
            } else {
626
32
                allpass_state->regexp_pattern_match(pattern_str);
627
32
            }
628
441
        }
629
564
        if (equal_state->_pattern_matched) {
630
467
            if constexpr (LIKE_PATTERN) {
631
435
                equal_state->like_pattern_match(pattern_str);
632
435
            } else {
633
32
                equal_state->regexp_pattern_match(pattern_str);
634
32
            }
635
467
        }
636
564
        if (substring_state->_pattern_matched) {
637
481
            if constexpr (LIKE_PATTERN) {
638
426
                substring_state->like_pattern_match(pattern_str);
639
426
            } else {
640
55
                substring_state->regexp_pattern_match(pattern_str);
641
55
            }
642
481
        }
643
564
        if (starts_with_state->_pattern_matched) {
644
466
            if constexpr (LIKE_PATTERN) {
645
416
                starts_with_state->like_pattern_match(pattern_str);
646
416
            } else {
647
50
                starts_with_state->regexp_pattern_match(pattern_str);
648
50
            }
649
466
        }
650
564
        if (ends_with_state->_pattern_matched) {
651
453
            if constexpr (LIKE_PATTERN) {
652
415
                ends_with_state->like_pattern_match(pattern_str);
653
415
            } else {
654
38
                ends_with_state->regexp_pattern_match(pattern_str);
655
38
            }
656
453
        }
657
564
    }
658
659
424
    if (allpass_state->_pattern_matched) {
660
21
        return allpass_state;
661
403
    } else if (equal_state->_pattern_matched) {
662
92
        return equal_state;
663
311
    } else if (substring_state->_pattern_matched) {
664
42
        return substring_state;
665
269
    } else if (starts_with_state->_pattern_matched) {
666
68
        return starts_with_state;
667
201
    } else if (ends_with_state->_pattern_matched) {
668
34
        return ends_with_state;
669
167
    } else {
670
167
        return nullptr;
671
167
    }
672
424
}
_ZN5doris16FunctionLikeBase24pattern_type_recognitionILb1EEESt10shared_ptrINS_24VectorPatternSearchStateEERKNS_9ColumnStrIjEE
Line
Count
Source
607
404
VPatternSearchStateSPtr FunctionLikeBase::pattern_type_recognition(const ColumnString& patterns) {
608
404
    VPatternSearchStateSPtr allpass_state = std::make_shared<VectorAllpassSearchState>();
609
404
    VPatternSearchStateSPtr equal_state = std::make_shared<VectorEqualSearchState>();
610
404
    VPatternSearchStateSPtr substring_state = std::make_shared<VectorSubStringSearchState>();
611
404
    VPatternSearchStateSPtr starts_with_state = std::make_shared<VectorStartsWithSearchState>();
612
404
    VPatternSearchStateSPtr ends_with_state = std::make_shared<VectorEndsWithSearchState>();
613
404
    size_t size = patterns.size();
614
615
889
    for (size_t i = 0; i < size; ++i) {
616
497
        if (!allpass_state->_pattern_matched && !equal_state->_pattern_matched &&
617
497
            !substring_state->_pattern_matched && !starts_with_state->_pattern_matched &&
618
497
            !ends_with_state->_pattern_matched) {
619
12
            return nullptr;
620
12
        }
621
485
        std::string pattern_str = patterns.get_data_at(i).to_string();
622
485
        if (allpass_state->_pattern_matched) {
623
409
            if constexpr (LIKE_PATTERN) {
624
409
                allpass_state->like_pattern_match(pattern_str);
625
            } else {
626
                allpass_state->regexp_pattern_match(pattern_str);
627
            }
628
409
        }
629
485
        if (equal_state->_pattern_matched) {
630
435
            if constexpr (LIKE_PATTERN) {
631
435
                equal_state->like_pattern_match(pattern_str);
632
            } else {
633
                equal_state->regexp_pattern_match(pattern_str);
634
            }
635
435
        }
636
485
        if (substring_state->_pattern_matched) {
637
426
            if constexpr (LIKE_PATTERN) {
638
426
                substring_state->like_pattern_match(pattern_str);
639
            } else {
640
                substring_state->regexp_pattern_match(pattern_str);
641
            }
642
426
        }
643
485
        if (starts_with_state->_pattern_matched) {
644
416
            if constexpr (LIKE_PATTERN) {
645
416
                starts_with_state->like_pattern_match(pattern_str);
646
            } else {
647
                starts_with_state->regexp_pattern_match(pattern_str);
648
            }
649
416
        }
650
485
        if (ends_with_state->_pattern_matched) {
651
415
            if constexpr (LIKE_PATTERN) {
652
415
                ends_with_state->like_pattern_match(pattern_str);
653
            } else {
654
                ends_with_state->regexp_pattern_match(pattern_str);
655
            }
656
415
        }
657
485
    }
658
659
392
    if (allpass_state->_pattern_matched) {
660
21
        return allpass_state;
661
371
    } else if (equal_state->_pattern_matched) {
662
92
        return equal_state;
663
279
    } else if (substring_state->_pattern_matched) {
664
31
        return substring_state;
665
248
    } else if (starts_with_state->_pattern_matched) {
666
63
        return starts_with_state;
667
185
    } else if (ends_with_state->_pattern_matched) {
668
31
        return ends_with_state;
669
154
    } else {
670
154
        return nullptr;
671
154
    }
672
392
}
_ZN5doris16FunctionLikeBase24pattern_type_recognitionILb0EEESt10shared_ptrINS_24VectorPatternSearchStateEERKNS_9ColumnStrIjEE
Line
Count
Source
607
32
VPatternSearchStateSPtr FunctionLikeBase::pattern_type_recognition(const ColumnString& patterns) {
608
32
    VPatternSearchStateSPtr allpass_state = std::make_shared<VectorAllpassSearchState>();
609
32
    VPatternSearchStateSPtr equal_state = std::make_shared<VectorEqualSearchState>();
610
32
    VPatternSearchStateSPtr substring_state = std::make_shared<VectorSubStringSearchState>();
611
32
    VPatternSearchStateSPtr starts_with_state = std::make_shared<VectorStartsWithSearchState>();
612
32
    VPatternSearchStateSPtr ends_with_state = std::make_shared<VectorEndsWithSearchState>();
613
32
    size_t size = patterns.size();
614
615
111
    for (size_t i = 0; i < size; ++i) {
616
79
        if (!allpass_state->_pattern_matched && !equal_state->_pattern_matched &&
617
79
            !substring_state->_pattern_matched && !starts_with_state->_pattern_matched &&
618
79
            !ends_with_state->_pattern_matched) {
619
0
            return nullptr;
620
0
        }
621
79
        std::string pattern_str = patterns.get_data_at(i).to_string();
622
79
        if (allpass_state->_pattern_matched) {
623
            if constexpr (LIKE_PATTERN) {
624
                allpass_state->like_pattern_match(pattern_str);
625
32
            } else {
626
32
                allpass_state->regexp_pattern_match(pattern_str);
627
32
            }
628
32
        }
629
79
        if (equal_state->_pattern_matched) {
630
            if constexpr (LIKE_PATTERN) {
631
                equal_state->like_pattern_match(pattern_str);
632
32
            } else {
633
32
                equal_state->regexp_pattern_match(pattern_str);
634
32
            }
635
32
        }
636
79
        if (substring_state->_pattern_matched) {
637
            if constexpr (LIKE_PATTERN) {
638
                substring_state->like_pattern_match(pattern_str);
639
55
            } else {
640
55
                substring_state->regexp_pattern_match(pattern_str);
641
55
            }
642
55
        }
643
79
        if (starts_with_state->_pattern_matched) {
644
            if constexpr (LIKE_PATTERN) {
645
                starts_with_state->like_pattern_match(pattern_str);
646
50
            } else {
647
50
                starts_with_state->regexp_pattern_match(pattern_str);
648
50
            }
649
50
        }
650
79
        if (ends_with_state->_pattern_matched) {
651
            if constexpr (LIKE_PATTERN) {
652
                ends_with_state->like_pattern_match(pattern_str);
653
38
            } else {
654
38
                ends_with_state->regexp_pattern_match(pattern_str);
655
38
            }
656
38
        }
657
79
    }
658
659
32
    if (allpass_state->_pattern_matched) {
660
0
        return allpass_state;
661
32
    } else if (equal_state->_pattern_matched) {
662
0
        return equal_state;
663
32
    } else if (substring_state->_pattern_matched) {
664
11
        return substring_state;
665
21
    } else if (starts_with_state->_pattern_matched) {
666
5
        return starts_with_state;
667
16
    } else if (ends_with_state->_pattern_matched) {
668
3
        return ends_with_state;
669
13
    } else {
670
13
        return nullptr;
671
13
    }
672
32
}
673
674
/// Evaluates LIKE / REGEXP when the pattern is a column rather than a constant.
///
/// For LIKE with a custom escape character, every pattern is first rewritten through
/// `replace_pattern_by_escape`. The (possibly rewritten) patterns are then passed to
/// `pattern_type_recognition`; when it yields a state, that state's vector function is
/// used for the whole column, otherwise each row is evaluated with `state->scalar_function`.
///
/// @param values           column of strings to be tested
/// @param patterns         column of patterns, one per row
/// @param result           output container; `result[i]` receives the row result on the
///                         scalar path
/// @param state            LIKE/REGEXP state (pattern kind, escape settings, scalar matcher)
/// @param input_rows_count number of rows to process
/// @return the first error returned by a matcher, otherwise the matcher's final status
Status FunctionLikeBase::vector_non_const(const ColumnString& values, const ColumnString& patterns,
                                          ColumnUInt8::Container& result, LikeState* state,
                                          size_t input_rows_count) const {
    ColumnString::MutablePtr replaced_patterns;
    VPatternSearchStateSPtr vector_search_state;
    if (state->is_like_pattern) {
        if (state->has_custom_escape) {
            replaced_patterns = ColumnString::create();
            for (size_t i = 0; i < input_rows_count; ++i) {
                const std::string val =
                        replace_pattern_by_escape(patterns.get_data_at(i), state->escape_char);
                replaced_patterns->insert_data(val.c_str(), val.size());
            }
            vector_search_state = pattern_type_recognition<true>(*replaced_patterns);
        } else {
            vector_search_state = pattern_type_recognition<true>(patterns);
        }
    } else {
        vector_search_state = pattern_type_recognition<false>(patterns);
    }

    // Select on whether the rewritten column was actually built instead of re-reading
    // `has_custom_escape`: the column only exists when the state is also a LIKE pattern,
    // so keying on the flag alone could dereference a null pointer.
    const ColumnString& real_pattern =
            replaced_patterns.get() != nullptr ? *replaced_patterns : patterns;

    if (vector_search_state == nullptr) {
        // pattern type recognition failed, use default case
        for (size_t i = 0; i < input_rows_count; ++i) {
            const auto pattern_val = real_pattern.get_data_at(i);
            const auto value_val = values.get_data_at(i);
            RETURN_IF_ERROR((state->scalar_function)(&state->search_state, value_val, pattern_val,
                                                     &result[i]));
        }
        return Status::OK();
    }
    const auto* search_strings =
            static_cast<const ColumnString*>(vector_search_state->_search_strings.get());
    return (vector_search_state->_vector_function)(values, *search_strings, result);
}
711
712
/// Vectorized LIKE for a pattern that has no specialised matcher: the LIKE pattern is
/// translated to a regular expression with `convert_like_pattern` and the work is
/// delegated to `regexp_fn`.
///
/// @param state   search state forwarded to the conversion and to `regexp_fn`
/// @param val     column of strings to be tested
/// @param pattern the LIKE pattern
/// @param result  output container filled by `regexp_fn`
/// @return the status returned by `regexp_fn`
Status FunctionLike::like_fn(const LikeSearchState* state, const ColumnString& val,
                             const StringRef& pattern, ColumnUInt8::Container& result) {
    const std::string like_pattern(pattern.data, pattern.size);
    std::string regex_pattern;
    convert_like_pattern(state, like_pattern, &regex_pattern);
    return regexp_fn(state, val, StringRef(regex_pattern.c_str(), regex_pattern.size()), result);
}
718
719
/// Scalar LIKE for a single (value, pattern) pair.
///
/// The pattern is first classified by `extract_like_fast_path`; the simple shapes are
/// answered with plain byte comparisons so that no regex has to be built. Everything else
/// is converted with `convert_like_pattern` and handed to `regexp_fn_scalar`.
///
/// @param state   search state, only used by the regex fallback
/// @param val     the string to be tested
/// @param pattern the LIKE pattern
/// @param result  receives 1 on match and 0 otherwise (fast paths); written by
///                `regexp_fn_scalar` on the fallback path
/// @return Status::OK() for the fast paths, otherwise the status of `regexp_fn_scalar`
Status FunctionLike::like_fn_scalar(const LikeSearchState* state, const StringRef& val,
                                    const StringRef& pattern, unsigned char* result) {
    // Try to use fast path to avoid regex compilation
    std::string search_string;
    LikeFastPath fast_path = extract_like_fast_path(pattern.data, pattern.size, search_string);

    // Every comparison below skips memcmp/memmem for an empty needle: the answer is
    // already known, and a zero-length memcmp on a null `val.data` is undefined behaviour.
    switch (fast_path) {
    case LikeFastPath::ALLPASS:
        *result = 1;
        return Status::OK();
    case LikeFastPath::EQUALS:
        *result = (val.size == search_string.size() &&
                   (search_string.empty() ||
                    memcmp(val.data, search_string.data(), search_string.size()) == 0));
        return Status::OK();
    case LikeFastPath::STARTS_WITH:
        *result = (val.size >= search_string.size() &&
                   (search_string.empty() ||
                    memcmp(val.data, search_string.data(), search_string.size()) == 0));
        return Status::OK();
    case LikeFastPath::ENDS_WITH:
        *result = (val.size >= search_string.size() &&
                   (search_string.empty() ||
                    memcmp(val.data + val.size - search_string.size(), search_string.data(),
                           search_string.size()) == 0));
        return Status::OK();
    case LikeFastPath::SUBSTRING:
        if (search_string.empty()) {
            *result = 1;
        } else {
            // Use memmem for substring search
            *result = (memmem(val.data, val.size, search_string.data(), search_string.size()) !=
                       nullptr);
        }
        return Status::OK();
    case LikeFastPath::REGEX:
    default: {
        // Fall back to regex matching
        std::string re_pattern;
        convert_like_pattern(state, std::string(pattern.data, pattern.size), &re_pattern);
        return regexp_fn_scalar(state, StringRef(val.data, val.size),
                                {re_pattern.c_str(), re_pattern.size()}, result);
    }
    }
}
761
762
/// Translates a LIKE pattern into an equivalent regular expression.
///
/// Rules applied, in order:
///   - an empty pattern becomes "^$";
///   - "^" is prepended unless the pattern starts with '%';
///   - "\%" and "\_" become the literal characters '%' and '_';
///   - "\\" stays an escaped backslash;
///   - '%' becomes ".*" and '_' becomes ".";
///   - regex metacharacters are backslash-escaped so they keep their literal meaning;
///   - "$" is appended unless the pattern ends with an unescaped '%'.
///
/// @param state      search state; not used by the conversion itself
/// @param pattern    the LIKE pattern to convert
/// @param re_pattern output; cleared first, then filled with the regular expression
void FunctionLike::convert_like_pattern(const LikeSearchState* state, const std::string& pattern,
                                        std::string* re_pattern) {
    re_pattern->clear();

    if (pattern.empty()) {
        re_pattern->append("^$");
        return;
    }

    // add ^ to pattern head to match line head
    if (pattern.front() != '%') {
        re_pattern->append("^");
    }

    // True only while the most recently consumed pattern token was an unescaped '%'.
    // Looking at the last output character is not enough: an escaped literal '*'
    // ("\*") also ends in '*' and must still be anchored with '$'.
    bool ends_with_wildcard = false;

    // expect % and _, all chars should keep it literal mean.
    for (size_t i = 0; i < pattern.size(); i++) {
        const char c = pattern[i];
        ends_with_wildcard = false;
        if (c == '\\' && i + 1 < pattern.size()) {
            const char next_c = pattern[i + 1];
            if (next_c == '%' || next_c == '_') {
                // convert "\%" and "\_" to literal "%" and "_"
                re_pattern->append(1, next_c);
                i++;
                continue;
            } else if (next_c == '\\') {
                // keep valid escape "\\"
                re_pattern->append("\\\\");
                i++;
                continue;
            }
        }

        if (c == '%') {
            re_pattern->append(".*");
            ends_with_wildcard = true;
        } else if (c == '_') {
            re_pattern->append(".");
        } else {
            // special for hyperscan: [, ], (, ), {, }, -, *, +, \, |, /, :, ^, ., $, ?
            if (c == '[' || c == ']' || c == '(' || c == ')' || c == '{' || c == '}' || c == '-' ||
                c == '*' || c == '+' || c == '\\' || c == '|' || c == '/' || c == ':' || c == '^' ||
                c == '.' || c == '$' || c == '?') {
                re_pattern->append(1, '\\');
            }
            re_pattern->append(1, c);
        }
    }

    // add $ to pattern tail to match line tail
    if (!ends_with_wildcard) {
        re_pattern->append("$");
    }
}
814
815
2.22k
/// Strips LIKE escape backslashes from `*search_string`, in place.
///
/// A backslash is dropped only when it is directly followed by '%', '_' or another
/// backslash; the following character is then kept as a literal. Any other backslash,
/// including one at the very end, is copied through unchanged. This is needed when a LIKE
/// pattern has been reduced to an equals / starts-with / ends-with / substring search and
/// the literal search string must be built from the pattern text.
void FunctionLike::remove_escape_character(std::string* search_string) {
    // Move the input aside so the result can be rebuilt into *search_string.
    std::string escaped;
    escaped.swap(*search_string);

    const size_t length = escaped.size();
    size_t pos = 0;
    while (pos < length) {
        const bool is_escape_pair =
                escaped[pos] == '\\' && pos + 1 < length &&
                (escaped[pos + 1] == '%' || escaped[pos + 1] == '_' || escaped[pos + 1] == '\\');
        if (is_escape_pair) {
            // Skip the backslash; the escaped character is emitted below.
            ++pos;
        }
        search_string->push_back(escaped[pos]);
        ++pos;
    }
}
833
834
0
bool re2_full_match(const std::string& str, const RE2& re, std::vector<std::string>& results) {
835
0
    if (!re.ok()) {
836
0
        return false;
837
0
    }
838
839
0
    std::vector<RE2::Arg> arguments;
840
0
    std::vector<RE2::Arg*> arguments_ptrs;
841
0
    std::size_t args_count = re.NumberOfCapturingGroups();
842
0
    arguments.resize(args_count);
843
0
    arguments_ptrs.resize(args_count);
844
0
    results.resize(args_count);
845
0
    for (std::size_t i = 0; i < args_count; ++i) {
846
0
        arguments[i] = &results[i];
847
0
        arguments_ptrs[i] = &arguments[i];
848
0
    }
849
850
0
    return RE2::FullMatchN(str, re, arguments_ptrs.data(), (int)args_count);
851
0
}
852
853
0
void verbose_log_match(const std::string& str, const std::string& pattern_name, const RE2& re) {
854
0
    std::vector<std::string> results;
855
0
    VLOG_DEBUG << "arg str: " << str << ", size: " << str.size() << ", pattern " << pattern_name
856
0
               << ": " << re.pattern() << ", size: " << re.pattern().size();
857
0
    if (re2_full_match(str, re, results)) {
858
0
        for (int i = 0; i < results.size(); ++i) {
859
0
            VLOG_DEBUG << "match " << i << ": " << results[i] << ", size: " << results[i].size();
860
0
        }
861
0
    } else {
862
0
        VLOG_DEBUG << "no match";
863
0
    }
864
0
}
865
866
/// Prepares `state` for a LIKE whose pattern is constant.
///
/// The pattern (rewritten through `replace_pattern_by_escape` when a custom escape is
/// configured) is classified in this order: all-pass, equals, starts-with, ends-with,
/// substring. The first class that matches installs its literal search string and its
/// vector/scalar matchers. An empty pattern is treated as "equals". Anything else is
/// converted to a regular expression and compiled with Hyperscan when `try_hyperscan` is
/// set and `hs_prepare` succeeds, otherwise with RE2.
///
/// @param context       function context, forwarded to `hs_prepare`
/// @param pattern       the constant LIKE pattern
/// @param state         state to populate
/// @param try_hyperscan whether Hyperscan may be used for the regex fallback
/// @return InternalError when the RE2 fallback cannot compile the pattern, else OK
Status FunctionLike::construct_like_const_state(FunctionContext* context, const StringRef& pattern,
                                                std::shared_ptr<LikeState>& state,
                                                bool try_hyperscan) {
    std::string pattern_str;
    if (!state->has_custom_escape) {
        pattern_str = pattern.to_string();
    } else {
        pattern_str = replace_pattern_by_escape(pattern, state->escape_char);
    }

    auto& search_state = state->search_state;
    search_state.pattern_str = pattern_str;

    // Literal captured by whichever classifier regex matches.
    std::string search_string;

    // Shared tail of the literal branches: log, strip escapes, store the literal.
    const auto adopt_literal = [&](const char* re_name, const RE2& re) {
        if (VLOG_DEBUG_IS_ON) {
            verbose_log_match(pattern_str, re_name, re);
            VLOG_DEBUG << "search_string : " << search_string << ", size: " << search_string.size();
        }
        remove_escape_character(&search_string);
        if (VLOG_DEBUG_IS_ON) {
            VLOG_DEBUG << "search_string escape removed: " << search_string
                       << ", size: " << search_string.size();
        }
        search_state.set_search_string(search_string);
    };

    if (!pattern_str.empty() && RE2::FullMatch(pattern_str, LIKE_ALLPASS_RE)) {
        search_state.set_search_string("");
        state->function = constant_allpass_fn;
        state->scalar_function = constant_allpass_fn_scalar;
    } else if (pattern_str.empty() || RE2::FullMatch(pattern_str, LIKE_EQUALS_RE, &search_string)) {
        adopt_literal("LIKE_EQUALS_RE", LIKE_EQUALS_RE);
        state->function = constant_equals_fn;
        state->scalar_function = constant_equals_fn_scalar;
    } else if (RE2::FullMatch(pattern_str, LIKE_STARTS_WITH_RE, &search_string)) {
        adopt_literal("LIKE_STARTS_WITH_RE", LIKE_STARTS_WITH_RE);
        state->function = constant_starts_with_fn;
        state->scalar_function = constant_starts_with_fn_scalar;
    } else if (RE2::FullMatch(pattern_str, LIKE_ENDS_WITH_RE, &search_string)) {
        adopt_literal("LIKE_ENDS_WITH_RE", LIKE_ENDS_WITH_RE);
        state->function = constant_ends_with_fn;
        state->scalar_function = constant_ends_with_fn_scalar;
    } else if (RE2::FullMatch(pattern_str, LIKE_SUBSTRING_RE, &search_string)) {
        adopt_literal("LIKE_SUBSTRING_RE", LIKE_SUBSTRING_RE);
        state->function = constant_substring_fn;
        state->scalar_function = constant_substring_fn_scalar;
    } else {
        std::string re_pattern;
        convert_like_pattern(&search_state, pattern_str, &re_pattern);
        if (VLOG_DEBUG_IS_ON) {
            VLOG_DEBUG << "hyperscan, pattern str: " << pattern_str
                       << ", size: " << pattern_str.size() << ", re pattern: " << re_pattern
                       << ", size: " << re_pattern.size();
        }

        hs_database_t* database = nullptr;
        hs_scratch_t* scratch = nullptr;
        const bool use_hyperscan =
                try_hyperscan && hs_prepare(context, re_pattern.c_str(), &database, &scratch).ok();
        if (use_hyperscan) {
            search_state.hs_database.reset(database);
            search_state.hs_scratch.reset(scratch);
        } else {
            // fallback to re2
            // reset hs_database to nullptr to indicate not use hyperscan
            search_state.hs_database.reset();
            search_state.hs_scratch.reset();

            RE2::Options opts;
            opts.set_never_nl(false);
            opts.set_dot_nl(true);
            search_state.regex = std::make_unique<RE2>(re_pattern, opts);
            if (!search_state.regex->ok()) {
                return Status::InternalError("Invalid regex expression: {}(origin: {})", re_pattern,
                                             pattern_str);
            }
        }

        state->function = constant_regex_fn;
        state->scalar_function = constant_regex_fn_scalar;
    }
    return Status::OK();
}
970
971
4.14k
/// Creates the per-thread LIKE state and registers it on `context`.
///
/// Nothing is done for scopes other than THREAD_LOCAL. The state starts with the generic
/// `like_fn` / `like_fn_scalar` matchers. When argument 2 is constant it is taken as the
/// custom escape character and must be exactly one byte long. When argument 1 (the
/// pattern) is constant, `construct_like_const_state` installs the matchers for it.
///
/// @return InternalError for a bad escape argument, any error from
///         `construct_like_const_state`, otherwise OK
Status FunctionLike::open(FunctionContext* context, FunctionContext::FunctionStateScope scope) {
    if (scope != FunctionContext::THREAD_LOCAL) {
        return Status::OK();
    }

    auto state = std::make_shared<LikeState>();
    state->is_like_pattern = true;
    state->function = like_fn;
    state->scalar_function = like_fn_scalar;

    // Optional third argument: custom escape character.
    if (context->is_col_constant(2)) {
        state->has_custom_escape = true;
        const auto escape_column = context->get_constant_col(2)->column_ptr;
        const auto& escape_ref = escape_column->get_data_at(0);
        if (escape_ref.size != 1) {
            return Status::InternalError("Escape character must be a single character, got: {}",
                                         escape_ref.to_string());
        }
        state->escape_char = escape_ref.data[0];
    }

    // Constant pattern: pick the matcher once, up front.
    if (context->is_col_constant(1)) {
        const auto pattern_column = context->get_constant_col(1)->column_ptr;
        const auto& pattern_ref = pattern_column->get_data_at(0);
        RETURN_IF_ERROR(construct_like_const_state(context, pattern_ref, state));
    }

    context->set_function_state(scope, state);
    return Status::OK();
}
998
999
/// Creates the per-thread REGEXP state and registers it on `context`.
///
/// Nothing is done for scopes other than THREAD_LOCAL. The state is registered right after
/// creation and starts with the generic `regexp_fn` / `regexp_fn_scalar` matchers. When
/// argument 1 (the pattern) is constant, it is classified in this order: all-pass, equals,
/// starts-with, ends-with, substring; the first class that matches installs its literal
/// and matchers. Any other pattern is compiled with Hyperscan, then RE2 if `hs_prepare`
/// fails, then Boost.Regex if RE2 rejects it and `enable_extended_regex()` is true.
///
/// @return InternalError when the pattern cannot be compiled by the permitted engines,
///         otherwise OK
Status FunctionRegexpLike::open(FunctionContext* context,
                                FunctionContext::FunctionStateScope scope) {
    if (scope != FunctionContext::THREAD_LOCAL) {
        return Status::OK();
    }

    auto state = std::make_shared<LikeState>();
    context->set_function_state(scope, state);
    state->is_like_pattern = false;
    state->function = regexp_fn;
    state->scalar_function = regexp_fn_scalar;

    if (!context->is_col_constant(1)) {
        return Status::OK();
    }

    const auto pattern_column = context->get_constant_col(1)->column_ptr;
    const auto& pattern_ref = pattern_column->get_data_at(0);
    std::string pattern_str = pattern_ref.to_string();
    auto& search_state = state->search_state;

    // Literal captured by whichever classifier regex matches.
    std::string literal;

    if (RE2::FullMatch(pattern_str, ALLPASS_RE)) {
        search_state.set_search_string("");
        state->function = constant_allpass_fn;
        state->scalar_function = constant_allpass_fn_scalar;
        return Status::OK();
    }
    if (RE2::FullMatch(pattern_str, EQUALS_RE, &literal)) {
        search_state.set_search_string(literal);
        state->function = constant_equals_fn;
        state->scalar_function = constant_equals_fn_scalar;
        return Status::OK();
    }
    if (RE2::FullMatch(pattern_str, STARTS_WITH_RE, &literal)) {
        search_state.set_search_string(literal);
        state->function = constant_starts_with_fn;
        state->scalar_function = constant_starts_with_fn_scalar;
        return Status::OK();
    }
    if (RE2::FullMatch(pattern_str, ENDS_WITH_RE, &literal)) {
        search_state.set_search_string(literal);
        state->function = constant_ends_with_fn;
        state->scalar_function = constant_ends_with_fn_scalar;
        return Status::OK();
    }
    if (RE2::FullMatch(pattern_str, SUBSTRING_RE, &literal)) {
        search_state.set_search_string(literal);
        state->function = constant_substring_fn;
        state->scalar_function = constant_substring_fn_scalar;
        return Status::OK();
    }

    // General regular expression.
    hs_database_t* database = nullptr;
    hs_scratch_t* scratch = nullptr;
    if (hs_prepare(context, pattern_str.c_str(), &database, &scratch).ok()) {
        // use hyperscan
        search_state.hs_database.reset(database);
        search_state.hs_scratch.reset(scratch);
    } else {
        // fallback to re2
        // reset hs_database to nullptr to indicate not use hyperscan
        search_state.hs_database.reset();
        search_state.hs_scratch.reset();
        RE2::Options opts;
        opts.set_never_nl(false);
        opts.set_dot_nl(true);
        search_state.regex = std::make_unique<RE2>(pattern_str, opts);
        if (!search_state.regex->ok()) {
            if (!context->state()->enable_extended_regex()) {
                return Status::InternalError(
                        "Invalid regex expression: {}. Error: {}. If you need advanced "
                        "regex features, try setting enable_extended_regex=true",
                        pattern_str, search_state.regex->error());
            }

            // RE2 failed, fallback to Boost.Regex
            // This handles advanced regex features like zero-width assertions
            search_state.regex.reset();
            try {
                search_state.boost_regex = std::make_unique<boost::regex>(pattern_str);
            } catch (const boost::regex_error& e) {
                return Status::InternalError("Invalid regex expression: {}. Error: {}",
                                             pattern_str, e.what());
            }
        }
    }

    state->function = constant_regex_fn;
    state->scalar_function = constant_regex_fn_scalar;
    return Status::OK();
}
1077
1078
8
/// Registers `FunctionLike` with the given function factory.
void register_function_like(SimpleFunctionFactory& factory) {
    factory.register_function<FunctionLike>();
}
1081
1082
8
/// Registers `FunctionRegexpLike` with the given function factory and additionally
/// registers `FunctionRegexpLike::alias` as an alias of `FunctionRegexpLike::name`.
void register_function_regexp(SimpleFunctionFactory& factory) {
    factory.register_function<FunctionRegexpLike>();
    factory.register_alias(FunctionRegexpLike::name, FunctionRegexpLike::alias);
}
1086
} // namespace doris