Coverage Report

Created: 2026-04-15 20:02

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exprs/function/like.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "exprs/function/like.h"
19
20
#include <fmt/format.h>
21
#include <hs/hs_compile.h>
22
#include <re2/stringpiece.h>
23
24
#include <cstddef>
25
#include <ostream>
26
#include <utility>
27
#include <vector>
28
29
#include "common/logging.h"
30
#include "core/block/block.h"
31
#include "core/block/column_with_type_and_name.h"
32
#include "core/column/column.h"
33
#include "core/column/column_const.h"
34
#include "core/column/column_vector.h"
35
#include "core/string_ref.h"
36
#include "exprs/function/simple_function_factory.h"
37
38
namespace doris {
39
// A regex to match any regex pattern which is equivalent to a substring search:
// optional leading ".*", a literal without regex metacharacters (captured), optional trailing ".*".
40
static const RE2 SUBSTRING_RE(R"((?:\.\*)*([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)(?:\.\*)*)");
41
42
// A regex to match any regex pattern which is equivalent to matching a constant string
43
// at the end of the string values.
44
static const RE2 ENDS_WITH_RE(R"((?:\.\*)*([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)\$)");
45
46
// A regex to match any regex pattern which is equivalent to matching a constant string
47
// at the start of the string values.
48
static const RE2 STARTS_WITH_RE(R"(\^([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)(?:\.\*)*)");
49
50
// A regex to match any regex pattern which is equivalent to a constant string match.
51
static const RE2 EQUALS_RE(R"(\^([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)\$)");
52
// A regex to match any regex pattern that consists only of one or more ".*".
53
static const RE2 ALLPASS_RE(R"((\.\*)+)");
54
55
// Like patterns
// Each regex below recognizes one simple shape of LIKE pattern and captures its literal part:
//   LIKE_SUBSTRING_RE   : one or more '%', a literal, one or more '%'
//   LIKE_ENDS_WITH_RE   : one or more '%', then a literal
//   LIKE_STARTS_WITH_RE : a literal, then one or more '%'
//   LIKE_EQUALS_RE      : a literal only
//   LIKE_ALLPASS_RE     : one or more '%' only
// The literal never contains an unescaped '%' or '_'. Note that the character classes differ
// between the patterns: only the substring and starts-with forms exclude a bare backslash.
56
static const re2::RE2 LIKE_SUBSTRING_RE(R"((?:%+)(((\\_)|([^%_\\]))+)(?:%+))");
57
static const re2::RE2 LIKE_ENDS_WITH_RE("(?:%+)(((\\\\_)|([^%_]))+)");
58
static const re2::RE2 LIKE_STARTS_WITH_RE(R"((((\\%)|(\\_)|([^%_\\]))+)(?:%+))");
59
static const re2::RE2 LIKE_EQUALS_RE("(((\\\\_)|([^%_]))+)");
60
static const re2::RE2 LIKE_ALLPASS_RE("%+");
61
62
struct VectorAllpassSearchState : public VectorPatternSearchState {
63
307
    VectorAllpassSearchState() : VectorPatternSearchState(FunctionLikeBase::vector_allpass_fn) {}
64
65
307
    ~VectorAllpassSearchState() override = default;
66
67
312
    void like_pattern_match(const std::string& pattern_str) override {
68
312
        if (!pattern_str.empty() && RE2::FullMatch(pattern_str, LIKE_ALLPASS_RE)) {
69
26
            _search_strings->insert_default();
70
286
        } else {
71
286
            _pattern_matched = false;
72
286
        }
73
312
    }
74
75
0
    void regexp_pattern_match(const std::string& pattern_str) override {
76
0
        if (RE2::FullMatch(pattern_str, ALLPASS_RE)) {
77
0
            _search_strings->insert_default();
78
0
        } else {
79
0
            _pattern_matched = false;
80
0
        }
81
0
    }
82
};
83
84
struct VectorEqualSearchState : public VectorPatternSearchState {
85
307
    VectorEqualSearchState() : VectorPatternSearchState(FunctionLikeBase::vector_equals_fn) {}
86
87
307
    ~VectorEqualSearchState() override = default;
88
89
315
    void like_pattern_match(const std::string& pattern_str) override {
90
315
        _search_string.clear();
91
315
        if (pattern_str.empty() || RE2::FullMatch(pattern_str, LIKE_EQUALS_RE, &_search_string)) {
92
86
            FunctionLike::remove_escape_character(&_search_string);
93
86
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
94
229
        } else {
95
229
            _pattern_matched = false;
96
229
        }
97
315
    }
98
99
0
    void regexp_pattern_match(const std::string& pattern_str) override {
100
0
        _search_string.clear();
101
0
        if (RE2::FullMatch(pattern_str, EQUALS_RE, &_search_string)) {
102
0
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
103
0
        } else {
104
0
            _pattern_matched = false;
105
0
        }
106
0
    }
107
};
108
109
struct VectorSubStringSearchState : public VectorPatternSearchState {
110
    VectorSubStringSearchState()
111
307
            : VectorPatternSearchState(FunctionLikeBase::vector_substring_fn) {}
112
113
307
    ~VectorSubStringSearchState() override = default;
114
115
321
    void like_pattern_match(const std::string& pattern_str) override {
116
321
        _search_string.clear();
117
321
        if (RE2::FullMatch(pattern_str, LIKE_SUBSTRING_RE, &_search_string)) {
118
41
            FunctionLike::remove_escape_character(&_search_string);
119
41
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
120
280
        } else {
121
280
            _pattern_matched = false;
122
280
        }
123
321
    }
124
125
0
    void regexp_pattern_match(const std::string& pattern_str) override {
126
0
        _search_string.clear();
127
0
        if (RE2::FullMatch(pattern_str, SUBSTRING_RE, &_search_string)) {
128
0
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
129
0
        } else {
130
0
            _pattern_matched = false;
131
0
        }
132
0
    }
133
};
134
135
struct VectorStartsWithSearchState : public VectorPatternSearchState {
136
    VectorStartsWithSearchState()
137
307
            : VectorPatternSearchState(FunctionLikeBase::vector_starts_with_fn) {}
138
139
307
    ~VectorStartsWithSearchState() override = default;
140
141
313
    void like_pattern_match(const std::string& pattern_str) override {
142
313
        _search_string.clear();
143
313
        if (RE2::FullMatch(pattern_str, LIKE_STARTS_WITH_RE, &_search_string)) {
144
37
            FunctionLike::remove_escape_character(&_search_string);
145
37
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
146
276
        } else {
147
276
            _pattern_matched = false;
148
276
        }
149
313
    }
150
151
0
    void regexp_pattern_match(const std::string& pattern_str) override {
152
0
        _search_string.clear();
153
0
        if (RE2::FullMatch(pattern_str, STARTS_WITH_RE, &_search_string)) {
154
0
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
155
0
        } else {
156
0
            _pattern_matched = false;
157
0
        }
158
0
    }
159
};
160
161
struct VectorEndsWithSearchState : public VectorPatternSearchState {
162
307
    VectorEndsWithSearchState() : VectorPatternSearchState(FunctionLikeBase::vector_ends_with_fn) {}
163
164
307
    ~VectorEndsWithSearchState() override = default;
165
166
312
    void like_pattern_match(const std::string& pattern_str) override {
167
312
        _search_string.clear();
168
312
        if (RE2::FullMatch(pattern_str, LIKE_ENDS_WITH_RE, &_search_string)) {
169
36
            FunctionLike::remove_escape_character(&_search_string);
170
36
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
171
276
        } else {
172
276
            _pattern_matched = false;
173
276
        }
174
312
    }
175
176
0
    void regexp_pattern_match(const std::string& pattern_str) override {
177
0
        _search_string.clear();
178
0
        if (RE2::FullMatch(pattern_str, ENDS_WITH_RE, &_search_string)) {
179
0
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
180
0
        } else {
181
0
            _pattern_matched = false;
182
0
        }
183
0
    }
184
};
185
186
0
// Copies this state's search string into `cloned` and rebuilds a regex matcher for it from
// this state's pattern_str (converted with FunctionLike::convert_like_pattern).
//
// If this state holds a Hyperscan database, a fresh database and scratch are compiled for the
// clone; otherwise the clone's Hyperscan handles are cleared and an RE2 matcher is built.
// Returns the hs_prepare error, or InternalError when the RE2 pattern does not compile.
// NOTE(review): boost_regex is not touched here — confirm whether a state using it can be cloned.
Status LikeSearchState::clone(LikeSearchState& cloned) {
    cloned.set_search_string(search_string);

    std::string re_pattern;
    FunctionLike::convert_like_pattern(this, pattern_str, &re_pattern);

    if (!hs_database) {
        // fallback to re2
        cloned.hs_database.reset();
        cloned.hs_scratch.reset();

        RE2::Options opts;
        opts.set_never_nl(false);
        opts.set_dot_nl(true);
        cloned.regex = std::make_unique<RE2>(re_pattern, opts);
        if (cloned.regex->ok()) {
            return Status::OK();
        }
        return Status::InternalError("Invalid regex expression: {}", re_pattern);
    }

    // use hyperscan: the clone gets its own database and scratch
    hs_database_t* new_database = nullptr;
    hs_scratch_t* new_scratch = nullptr;
    RETURN_IF_ERROR(
            FunctionLike::hs_prepare(nullptr, re_pattern.c_str(), &new_database, &new_scratch));

    cloned.hs_database.reset(new_database);
    cloned.hs_scratch.reset(new_scratch);
    return Status::OK();
}
213
214
// Match-everything kernel for a constant pattern: sets the first vals.size() bytes of
// `result` to 1. `state` and `pattern` are not used.
Status FunctionLikeBase::constant_allpass_fn(const LikeSearchState* state, const ColumnString& vals,
                                             const StringRef& pattern,
                                             ColumnUInt8::Container& result) {
    const size_t row_count = vals.size();
    memset(result.data(), 1, row_count);
    return Status::OK();
}
220
221
// Scalar match-everything kernel: always writes 1 to `*result`.
// `state`, `val` and `pattern` are not used.
Status FunctionLikeBase::constant_allpass_fn_scalar(const LikeSearchState* state,
                                                    const StringRef& val, const StringRef& pattern,
                                                    unsigned char* result) {
    unsigned char& matched = *result;
    matched = 1;
    return Status::OK();
}
227
228
// Match-everything kernel for per-row patterns: sets the first vals.size() bytes of `result`
// to 1. The three containers are expected to have the same number of rows (debug-checked).
Status FunctionLikeBase::vector_allpass_fn(const ColumnString& vals,
                                           const ColumnString& search_strings,
                                           ColumnUInt8::Container& result) {
    const size_t row_count = vals.size();
    DCHECK(row_count == search_strings.size());
    DCHECK(row_count == result.size());
    memset(result.data(), 1, row_count);
    return Status::OK();
}
236
237
// Prefix kernel for a constant pattern: result[i] is true when row i of `val` is at least as
// long as state->search_string_sv and starts with it. `pattern` is not used.
Status FunctionLikeBase::constant_starts_with_fn(const LikeSearchState* state,
                                                 const ColumnString& val, const StringRef& pattern,
                                                 ColumnUInt8::Container& result) {
    const size_t row_count = val.size();
    for (size_t row = 0; row < row_count; ++row) {
        const auto& cell = val.get_data_at(row);
        const bool long_enough = cell.size >= state->search_string_sv.size;
        result[row] = long_enough && cell.start_with(state->search_string_sv);
    }
    return Status::OK();
}
248
249
// Scalar prefix kernel: `*result` is true when `val` is at least as long as
// state->search_string_sv and its leading bytes of that length compare equal to it.
// `pattern` is not used.
Status FunctionLikeBase::constant_starts_with_fn_scalar(const LikeSearchState* state,
                                                        const StringRef& val,
                                                        const StringRef& pattern,
                                                        unsigned char* result) {
    if (val.size < state->search_string_sv.size) {
        // Too short to contain the prefix.
        *result = false;
    } else {
        *result = (state->search_string_sv == val.substring(0, state->search_string_sv.size));
    }
    return Status::OK();
}
257
258
// Prefix kernel for per-row patterns: result[i] is true when vals[i] is at least as long as
// search_strings[i] and starts with it. All three containers are expected to have the same
// number of rows (debug-checked).
Status FunctionLikeBase::vector_starts_with_fn(const ColumnString& vals,
                                               const ColumnString& search_strings,
                                               ColumnUInt8::Container& result) {
    const size_t row_count = vals.size();
    DCHECK(row_count == search_strings.size());
    DCHECK(row_count == result.size());
    for (size_t row = 0; row < row_count; ++row) {
        const auto& cell = vals.get_data_at(row);
        const auto& prefix = search_strings.get_data_at(row);
        const bool long_enough = cell.size >= prefix.size;
        result[row] = long_enough && cell.start_with(prefix);
    }
    return Status::OK();
}
271
272
// Suffix kernel for a constant pattern: result[i] is true when row i of `val` is at least as
// long as state->search_string_sv and ends with it. `pattern` is not used.
Status FunctionLikeBase::constant_ends_with_fn(const LikeSearchState* state,
                                               const ColumnString& val, const StringRef& pattern,
                                               ColumnUInt8::Container& result) {
    const size_t row_count = val.size();
    for (size_t row = 0; row < row_count; ++row) {
        const auto& cell = val.get_data_at(row);
        const bool long_enough = cell.size >= state->search_string_sv.size;
        result[row] = long_enough && cell.end_with(state->search_string_sv);
    }
    return Status::OK();
}
283
284
// Scalar suffix kernel: `*result` is true when `val` is at least as long as
// state->search_string_sv and its trailing bytes of that length compare equal to it.
// `pattern` is not used.
Status FunctionLikeBase::constant_ends_with_fn_scalar(const LikeSearchState* state,
                                                      const StringRef& val,
                                                      const StringRef& pattern,
                                                      unsigned char* result) {
    if (val.size < state->search_string_sv.size) {
        // Too short to contain the suffix; also keeps the offset below from underflowing.
        *result = false;
    } else {
        *result = (state->search_string_sv ==
                   val.substring(val.size - state->search_string_sv.size,
                                 state->search_string_sv.size));
    }
    return Status::OK();
}
293
294
// Suffix kernel for per-row patterns: result[i] is true when vals[i] is at least as long as
// search_strings[i] and ends with it. All three containers are expected to have the same
// number of rows (debug-checked).
Status FunctionLikeBase::vector_ends_with_fn(const ColumnString& vals,
                                             const ColumnString& search_strings,
                                             ColumnUInt8::Container& result) {
    const size_t row_count = vals.size();
    DCHECK(row_count == search_strings.size());
    DCHECK(row_count == result.size());
    for (size_t row = 0; row < row_count; ++row) {
        const auto& cell = vals.get_data_at(row);
        const auto& suffix = search_strings.get_data_at(row);
        const bool long_enough = cell.size >= suffix.size;
        result[row] = long_enough && cell.end_with(suffix);
    }
    return Status::OK();
}
307
308
// Equality kernel for a constant pattern: result[i] is true when row i of `val` compares
// equal to state->search_string_sv. `pattern` is not used.
Status FunctionLikeBase::constant_equals_fn(const LikeSearchState* state, const ColumnString& val,
                                            const StringRef& pattern,
                                            ColumnUInt8::Container& result) {
    const size_t row_count = val.size();
    for (size_t row = 0; row < row_count; ++row) {
        const bool is_equal = (val.get_data_at(row) == state->search_string_sv);
        result[row] = is_equal;
    }
    return Status::OK();
}
317
318
// Scalar equality kernel: `*result` is true when `val` compares equal to
// state->search_string_sv. `pattern` is not used.
Status FunctionLikeBase::constant_equals_fn_scalar(const LikeSearchState* state,
                                                   const StringRef& val, const StringRef& pattern,
                                                   unsigned char* result) {
    const bool is_equal = (val == state->search_string_sv);
    *result = is_equal;
    return Status::OK();
}
324
325
// Equality kernel for per-row patterns: result[i] is true when vals[i] compares equal to
// search_strings[i]. All three containers are expected to have the same number of rows
// (debug-checked).
Status FunctionLikeBase::vector_equals_fn(const ColumnString& vals,
                                          const ColumnString& search_strings,
                                          ColumnUInt8::Container& result) {
    const size_t row_count = vals.size();
    DCHECK(row_count == search_strings.size());
    DCHECK(row_count == result.size());
    for (size_t row = 0; row < row_count; ++row) {
        const auto& cell = vals.get_data_at(row);
        const auto& expected = search_strings.get_data_at(row);
        result[row] = (cell == expected);
    }
    return Status::OK();
}
338
339
// Substring kernel for a constant pattern. For every row of `val`, result[i] is true when
// state->search_string_sv is empty, or when state->substring_pattern.search() on that row
// returns something other than -1. `pattern` is not used.
Status FunctionLikeBase::constant_substring_fn(const LikeSearchState* state,
                                               const ColumnString& val, const StringRef& pattern,
                                               ColumnUInt8::Container& result) {
    const size_t row_count = val.size();
    for (size_t row = 0; row < row_count; ++row) {
        // An empty needle matches every row; the short-circuit skips the search in that case.
        result[row] = (state->search_string_sv.size == 0) ||
                      (state->substring_pattern.search(val.get_data_at(row)) != -1);
    }
    return Status::OK();
}
352
353
// Scalar substring kernel: `*result` is true when state->search_string_sv is empty, or when
// state->substring_pattern.search(val) returns something other than -1. `pattern` is not used.
Status FunctionLikeBase::constant_substring_fn_scalar(const LikeSearchState* state,
                                                      const StringRef& val,
                                                      const StringRef& pattern,
                                                      unsigned char* result) {
    if (state->search_string_sv.size != 0) {
        *result = state->substring_pattern.search(val) != -1;
    } else {
        // An empty needle matches any value.
        *result = true;
    }
    return Status::OK();
}
364
365
// Substring kernel for per-row patterns: result[i] is true when search_strings[i] is empty or
// is found inside vals[i]. A StringSearch is constructed per row because each row carries its
// own needle. All three containers are expected to have the same number of rows (debug-checked).
Status FunctionLikeBase::vector_substring_fn(const ColumnString& vals,
                                             const ColumnString& search_strings,
                                             ColumnUInt8::Container& result) {
    const size_t row_count = vals.size();
    DCHECK(row_count == search_strings.size());
    DCHECK(row_count == result.size());
    for (size_t row = 0; row < row_count; ++row) {
        const auto& haystack = vals.get_data_at(row);
        const auto& needle = search_strings.get_data_at(row);
        if (needle.size != 0) {
            doris::StringSearch matcher(&needle);
            result[row] = matcher.search(haystack) != -1;
        } else {
            // An empty needle matches any value.
            result[row] = true;
        }
    }
    return Status::OK();
}
383
384
// Scalar regex kernel using the matcher precompiled in `state`. The engine is chosen in this
// order: Hyperscan (state->hs_database), boost::regex (state->boost_regex), then RE2
// (state->regex). `pattern` is not used.
//
// On the Hyperscan path `result` is only handed to hs_match_handler as its context pointer;
// this function never writes `*result` itself on that path.
// NOTE(review): presumably the caller pre-zeroes `*result` and the handler sets it on a match —
// confirm against hs_match_handler.
// Returns RuntimeError when hs_scan reports anything other than success or scan-terminated.
Status FunctionLikeBase::constant_regex_fn_scalar(const LikeSearchState* state,
                                                  const StringRef& val, const StringRef& pattern,
                                                  unsigned char* result) {
    if (state->hs_database) {
        // use hyperscan
        const auto scan_rc = hs_scan(state->hs_database.get(), val.data,
                                     static_cast<int>(val.size), 0, state->hs_scratch.get(),
                                     doris::LikeSearchState::hs_match_handler,
                                     static_cast<void*>(result));
        const bool scan_ok = (scan_rc == HS_SUCCESS) || (scan_rc == HS_SCAN_TERMINATED);
        if (!scan_ok) {
            return Status::RuntimeError(fmt::format("hyperscan error: {}", scan_rc));
        }
        return Status::OK();
    }

    if (state->boost_regex) {
        // use boost::regex for advanced features
        *result = boost::regex_search(val.data, val.data + val.size, *state->boost_regex);
        return Status::OK();
    }

    // fallback to re2
    *result = RE2::PartialMatch(re2::StringPiece(val.data, val.size), *state->regex);
    return Status::OK();
}
402
403
// Scalar regex kernel for a non-precompiled pattern: compiles `pattern` with RE2 on every call
// (never_nl off, dot matches newline) and writes to `*result` whether it partially matches
// `val`. `state` is not used.
// Returns RuntimeError when `pattern` is not a valid RE2 expression.
Status FunctionLikeBase::regexp_fn_scalar(const LikeSearchState* state, const StringRef& val,
                                          const StringRef& pattern, unsigned char* result) {
    RE2::Options opts;
    opts.set_never_nl(false);
    opts.set_dot_nl(true);

    re2::RE2 compiled(re2::StringPiece(pattern.data, pattern.size), opts);
    if (!compiled.ok()) {
        return Status::RuntimeError("Invalid pattern: {}", pattern.debug_string());
    }

    *result = RE2::PartialMatch(re2::StringPiece(val.data, val.size), compiled);
    return Status::OK();
}
417
418
// Column regex kernel using the matcher precompiled in `state`. The engine is chosen in this
// order: Hyperscan (state->hs_database), boost::regex (state->boost_regex), then RE2
// (state->regex). `pattern` is not used.
//
// On the Hyperscan path each row's result slot is only handed to hs_match_handler as its
// context pointer; this function never writes the slot itself on that path.
// NOTE(review): presumably the handler sets the slot on a match and slots start at 0 —
// confirm against hs_match_handler and the caller.
// Returns RuntimeError when hs_scan reports anything other than success or scan-terminated.
Status FunctionLikeBase::constant_regex_fn(const LikeSearchState* state, const ColumnString& val,
                                           const StringRef& pattern,
                                           ColumnUInt8::Container& result) {
    const size_t row_count = val.size();

    if (state->hs_database) {
        // use hyperscan
        for (size_t row = 0; row < row_count; ++row) {
            const auto& cell = val.get_data_at(row);
            const auto scan_rc = hs_scan(state->hs_database.get(), cell.data,
                                         static_cast<int>(cell.size), 0, state->hs_scratch.get(),
                                         doris::LikeSearchState::hs_match_handler,
                                         static_cast<void*>(result.data() + row));
            if (scan_rc != HS_SUCCESS && scan_rc != HS_SCAN_TERMINATED) {
                return Status::RuntimeError(fmt::format("hyperscan error: {}", scan_rc));
            }
        }
        return Status::OK();
    }

    if (state->boost_regex) {
        // use boost::regex for advanced features
        for (size_t row = 0; row < row_count; ++row) {
            const auto& cell = val.get_data_at(row);
            result.data()[row] =
                    boost::regex_search(cell.data, cell.data + cell.size, *state->boost_regex);
        }
        return Status::OK();
    }

    // fallback to re2
    for (size_t row = 0; row < row_count; ++row) {
        const auto& cell = val.get_data_at(row);
        result.data()[row] =
                RE2::PartialMatch(re2::StringPiece(cell.data, cell.size), *state->regex);
    }
    return Status::OK();
}
448
449
// Column regex kernel for a non-precompiled pattern. Compiles `pattern` on every call, with
// Hyperscan when hs_prepare succeeds and with RE2 otherwise, then evaluates it against every
// row of `val`. `state` is not used.
//
// On the Hyperscan path each row's result slot is only handed to hs_match_handler as its
// context pointer; on the RE2 path the slot is written with the partial-match outcome.
// The Hyperscan database and scratch created here are released on every exit path, including
// the early return on a scan error.
// Returns RuntimeError on a Hyperscan scan failure, or when the RE2 fallback cannot compile
// the pattern.
Status FunctionLikeBase::regexp_fn(const LikeSearchState* state, const ColumnString& val,
                                   const StringRef& pattern, ColumnUInt8::Container& result) {
    std::string re_pattern(pattern.data, pattern.size);

    hs_database_t* database = nullptr;
    hs_scratch_t* scratch = nullptr;
    if (hs_prepare(nullptr, re_pattern.c_str(), &database, &scratch).ok()) { // use hyperscan
        auto sz = val.size();
        for (size_t i = 0; i < sz; i++) {
            const auto& str_ref = val.get_data_at(i);
            auto ret =
                    hs_scan(database, str_ref.data, (int)str_ref.size, 0, scratch,
                            doris::LikeSearchState::hs_match_handler, (void*)(result.data() + i));
            if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
                // Release the per-call handles before bailing out; otherwise they leak.
                hs_free_scratch(scratch);
                hs_free_database(database);
                return Status::RuntimeError(fmt::format("hyperscan error: {}", ret));
            }
        }

        hs_free_scratch(scratch);
        hs_free_database(database);
    } else { // fallback to re2
        RE2::Options opts;
        opts.set_never_nl(false);
        opts.set_dot_nl(true);
        re2::RE2 re(re_pattern, opts);
        if (re.ok()) {
            auto sz = val.size();
            for (size_t i = 0; i < sz; i++) {
                const auto& str_ref = val.get_data_at(i);
                *(result.data() + i) =
                        RE2::PartialMatch(re2::StringPiece(str_ref.data, str_ref.size), re);
            }
        } else {
            return Status::RuntimeError("Invalid pattern: {}", pattern.debug_string());
        }
    }

    return Status::OK();
}
488
489
// hyperscan compile expression to database and allocate scratch space
490
Status FunctionLikeBase::hs_prepare(FunctionContext* context, const char* expression,
491
132
                                    hs_database_t** database, hs_scratch_t** scratch) {
492
132
    hs_compile_error_t* compile_err;
493
132
    auto res = hs_compile(expression, HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8,
494
132
                          HS_MODE_BLOCK, nullptr, database, &compile_err);
495
496
132
    if (res != HS_SUCCESS) {
497
0
        *database = nullptr;
498
0
        std::string error_message = compile_err->message;
499
0
        hs_free_compile_error(compile_err);
500
        // Do not call FunctionContext::set_error here, since we do not want to cancel the query here.
501
0
        return Status::RuntimeError<false>("hs_compile regex pattern error:" + error_message);
502
0
    }
503
132
    hs_free_compile_error(compile_err);
504
505
132
    if (hs_alloc_scratch(*database, scratch) != HS_SUCCESS) {
506
0
        hs_free_database(*database);
507
0
        *database = nullptr;
508
0
        *scratch = nullptr;
509
        // Do not call FunctionContext::set_error here, since we do not want to cancel the query here.
510
0
        return Status::RuntimeError<false>("hs_alloc_scratch allocate scratch space error");
511
0
    }
512
513
132
    return Status::OK();
514
132
}
515
516
// Evaluates the like/regexp predicate for one block.
//
// arguments[0] is the value column (expanded if constant; must be a ColumnString) and
// arguments[1] is the pattern column. A UInt8 result column with `input_rows_count` rows is
// created, filled by one of the paths below, and stored at position `result`.
//
// Dispatch:
//   * the thread-local LikeState's function is exactly constant_substring_fn -> execute_substring
//     over the raw chars/offsets of the value column;
//   * the pattern column is a ColumnString -> vector_non_const (per-row patterns);
//   * the pattern column is a ColumnConst -> vector_const with the state's function.
// Returns InternalError when either column has an unsupported type.
Status FunctionLikeBase::execute_impl(FunctionContext* context, Block& block,
                                      const ColumnNumbers& arguments, uint32_t result,
                                      size_t input_rows_count) const {
    const auto values_col =
            block.get_by_position(arguments[0]).column->convert_to_full_column_if_const();
    const auto* values = check_and_get_column<ColumnString>(values_col.get());

    if (!values) {
        return Status::InternalError("Not supported input arguments types");
    }
    // result column
    auto res = ColumnUInt8::create();
    ColumnUInt8::Container& vec_res = res->get_data();
    // set default value to 0, and match functions only need to set 1/true
    vec_res.resize_fill(input_rows_count);
    auto* state = reinterpret_cast<LikeState*>(
            context->get_function_state(FunctionContext::THREAD_LOCAL));

    // target<>() yields nullptr when the stored callable is not a plain function pointer of
    // this exact type, so it must be checked before being dereferenced.
    using ColumnLikeFnPtr = doris::Status (*)(const LikeSearchState* state, const ColumnString&,
                                              const StringRef&, ColumnUInt8::Container&);
    const auto* stored_fn = state->function.target<ColumnLikeFnPtr>();

    // for constant_substring_fn, use long run length search for performance
    if (stored_fn != nullptr && constant_substring_fn == *stored_fn) {
        RETURN_IF_ERROR(execute_substring(values->get_chars(), values->get_offsets(), vec_res,
                                          &state->search_state));
    } else {
        const auto pattern_col = block.get_by_position(arguments[1]).column;
        if (const auto* str_patterns = check_and_get_column<ColumnString>(pattern_col.get())) {
            RETURN_IF_ERROR(
                    vector_non_const(*values, *str_patterns, vec_res, state, input_rows_count));
        } else if (const auto* const_patterns =
                           check_and_get_column<ColumnConst>(pattern_col.get())) {
            const auto& pattern_val = const_patterns->get_data_at(0);
            RETURN_IF_ERROR(vector_const(*values, &pattern_val, vec_res, state->function,
                                         &state->search_state));
        } else {
            return Status::InternalError("Not supported input arguments types");
        }
    }
    block.replace_by_position(result, std::move(res));
    return Status::OK();
}
557
558
// Substring search over a whole string column at once: the concatenated character buffer is
// scanned as one long haystack and every hit is mapped back to the row that contains it.
//
// `value_offsets[i]` is used as the end of row i (equivalently the start of row i + 1) inside
// `values`. `result` is only ever set to 1 for matching rows; other entries are left untouched.
Status FunctionLikeBase::execute_substring(const ColumnString::Chars& values,
                                           const ColumnString::Offsets& value_offsets,
                                           ColumnUInt8::Container& result,
                                           LikeSearchState* search_state) const {
    const UInt8* const data_begin = values.data();
    const UInt8* const data_end = data_begin + values.size();
    const UInt8* cursor = data_begin;

    // Index of the row currently being considered.
    size_t row = 0;
    const size_t needle_size = search_state->substring_pattern.get_pattern_length();

    while (cursor < data_end) {
        // Find the next occurrence anywhere in the remaining buffer; a returned position at or
        // past data_end ends the scan.
        cursor = (UInt8*)search_state->substring_pattern.search((char*)cursor, data_end - cursor);
        if (cursor >= data_end) {
            break;
        }

        // Advance to the first row whose end offset is not before the hit.
        for (; row < value_offsets.size() && data_begin + value_offsets[row] < cursor; ++row) {
        }

        // Only count the hit if it lies entirely inside this row, i.e. it does not straddle
        // the boundary into the next row.
        const UInt8* const row_end = data_begin + value_offsets[row];
        if (cursor + needle_size <= row_end) {
            result[row] = 1;
        }

        // Resume scanning at the start of the next row.
        cursor = row_end;
        ++row;
    }

    return Status::OK();
}
597
598
// Applies `function` to the whole `values` column with the single constant pattern
// `*pattern_val`, writing matches into `result`. Any error from `function` is propagated.
Status FunctionLikeBase::vector_const(const ColumnString& values, const StringRef* pattern_val,
                                      ColumnUInt8::Container& result, const LikeFn& function,
                                      LikeSearchState* search_state) const {
    const StringRef& constant_pattern = *pattern_val;
    RETURN_IF_ERROR(function(search_state, values, constant_pattern, result));
    return Status::OK();
}
605
606
template <bool LIKE_PATTERN>
607
307
VPatternSearchStateSPtr FunctionLikeBase::pattern_type_recognition(const ColumnString& patterns) {
608
307
    VPatternSearchStateSPtr allpass_state = std::make_shared<VectorAllpassSearchState>();
609
307
    VPatternSearchStateSPtr equal_state = std::make_shared<VectorEqualSearchState>();
610
307
    VPatternSearchStateSPtr substring_state = std::make_shared<VectorSubStringSearchState>();
611
307
    VPatternSearchStateSPtr starts_with_state = std::make_shared<VectorStartsWithSearchState>();
612
307
    VPatternSearchStateSPtr ends_with_state = std::make_shared<VectorEndsWithSearchState>();
613
307
    size_t size = patterns.size();
614
615
652
    for (size_t i = 0; i < size; ++i) {
616
357
        if (!allpass_state->_pattern_matched && !equal_state->_pattern_matched &&
617
357
            !substring_state->_pattern_matched && !starts_with_state->_pattern_matched &&
618
357
            !ends_with_state->_pattern_matched) {
619
12
            return nullptr;
620
12
        }
621
345
        std::string pattern_str = patterns.get_data_at(i).to_string();
622
345
        if (allpass_state->_pattern_matched) {
623
312
            if constexpr (LIKE_PATTERN) {
624
312
                allpass_state->like_pattern_match(pattern_str);
625
312
            } else {
626
0
                allpass_state->regexp_pattern_match(pattern_str);
627
0
            }
628
312
        }
629
345
        if (equal_state->_pattern_matched) {
630
315
            if constexpr (LIKE_PATTERN) {
631
315
                equal_state->like_pattern_match(pattern_str);
632
315
            } else {
633
0
                equal_state->regexp_pattern_match(pattern_str);
634
0
            }
635
315
        }
636
345
        if (substring_state->_pattern_matched) {
637
321
            if constexpr (LIKE_PATTERN) {
638
321
                substring_state->like_pattern_match(pattern_str);
639
321
            } else {
640
0
                substring_state->regexp_pattern_match(pattern_str);
641
0
            }
642
321
        }
643
345
        if (starts_with_state->_pattern_matched) {
644
313
            if constexpr (LIKE_PATTERN) {
645
313
                starts_with_state->like_pattern_match(pattern_str);
646
313
            } else {
647
0
                starts_with_state->regexp_pattern_match(pattern_str);
648
0
            }
649
313
        }
650
345
        if (ends_with_state->_pattern_matched) {
651
312
            if constexpr (LIKE_PATTERN) {
652
312
                ends_with_state->like_pattern_match(pattern_str);
653
312
            } else {
654
0
                ends_with_state->regexp_pattern_match(pattern_str);
655
0
            }
656
312
        }
657
345
    }
658
659
295
    if (allpass_state->_pattern_matched) {
660
21
        return allpass_state;
661
274
    } else if (equal_state->_pattern_matched) {
662
78
        return equal_state;
663
196
    } else if (substring_state->_pattern_matched) {
664
27
        return substring_state;
665
169
    } else if (starts_with_state->_pattern_matched) {
666
31
        return starts_with_state;
667
138
    } else if (ends_with_state->_pattern_matched) {
668
31
        return ends_with_state;
669
107
    } else {
670
107
        return nullptr;
671
107
    }
672
295
}
_ZN5doris16FunctionLikeBase24pattern_type_recognitionILb1EEESt10shared_ptrINS_24VectorPatternSearchStateEERKNS_9ColumnStrIjEE
Line
Count
Source
607
307
VPatternSearchStateSPtr FunctionLikeBase::pattern_type_recognition(const ColumnString& patterns) {
608
307
    VPatternSearchStateSPtr allpass_state = std::make_shared<VectorAllpassSearchState>();
609
307
    VPatternSearchStateSPtr equal_state = std::make_shared<VectorEqualSearchState>();
610
307
    VPatternSearchStateSPtr substring_state = std::make_shared<VectorSubStringSearchState>();
611
307
    VPatternSearchStateSPtr starts_with_state = std::make_shared<VectorStartsWithSearchState>();
612
307
    VPatternSearchStateSPtr ends_with_state = std::make_shared<VectorEndsWithSearchState>();
613
307
    size_t size = patterns.size();
614
615
652
    for (size_t i = 0; i < size; ++i) {
616
357
        if (!allpass_state->_pattern_matched && !equal_state->_pattern_matched &&
617
357
            !substring_state->_pattern_matched && !starts_with_state->_pattern_matched &&
618
357
            !ends_with_state->_pattern_matched) {
619
12
            return nullptr;
620
12
        }
621
345
        std::string pattern_str = patterns.get_data_at(i).to_string();
622
345
        if (allpass_state->_pattern_matched) {
623
312
            if constexpr (LIKE_PATTERN) {
624
312
                allpass_state->like_pattern_match(pattern_str);
625
            } else {
626
                allpass_state->regexp_pattern_match(pattern_str);
627
            }
628
312
        }
629
345
        if (equal_state->_pattern_matched) {
630
315
            if constexpr (LIKE_PATTERN) {
631
315
                equal_state->like_pattern_match(pattern_str);
632
            } else {
633
                equal_state->regexp_pattern_match(pattern_str);
634
            }
635
315
        }
636
345
        if (substring_state->_pattern_matched) {
637
321
            if constexpr (LIKE_PATTERN) {
638
321
                substring_state->like_pattern_match(pattern_str);
639
            } else {
640
                substring_state->regexp_pattern_match(pattern_str);
641
            }
642
321
        }
643
345
        if (starts_with_state->_pattern_matched) {
644
313
            if constexpr (LIKE_PATTERN) {
645
313
                starts_with_state->like_pattern_match(pattern_str);
646
            } else {
647
                starts_with_state->regexp_pattern_match(pattern_str);
648
            }
649
313
        }
650
345
        if (ends_with_state->_pattern_matched) {
651
312
            if constexpr (LIKE_PATTERN) {
652
312
                ends_with_state->like_pattern_match(pattern_str);
653
            } else {
654
                ends_with_state->regexp_pattern_match(pattern_str);
655
            }
656
312
        }
657
345
    }
658
659
295
    if (allpass_state->_pattern_matched) {
660
21
        return allpass_state;
661
274
    } else if (equal_state->_pattern_matched) {
662
78
        return equal_state;
663
196
    } else if (substring_state->_pattern_matched) {
664
27
        return substring_state;
665
169
    } else if (starts_with_state->_pattern_matched) {
666
31
        return starts_with_state;
667
138
    } else if (ends_with_state->_pattern_matched) {
668
31
        return ends_with_state;
669
107
    } else {
670
107
        return nullptr;
671
107
    }
672
295
}
Unexecuted instantiation: _ZN5doris16FunctionLikeBase24pattern_type_recognitionILb0EEESt10shared_ptrINS_24VectorPatternSearchStateEERKNS_9ColumnStrIjEE
673
674
// Evaluates LIKE / REGEXP when the pattern is a column rather than a constant.
//
// For LIKE with a custom escape character, every pattern is first rewritten with
// `replace_pattern_by_escape` into a temporary column. The (possibly rewritten)
// patterns are then classified by `pattern_type_recognition`:
//  - if one vectorized strategy fits all rows, its `_vector_function` is run on
//    `values` and the strategy's `_search_strings`;
//  - otherwise each row is evaluated on its own through `state->scalar_function`.
//
// `result` must be able to hold `input_rows_count` entries. Returns the first
// non-OK status produced by a row-level evaluation, or the vector function's status.
Status FunctionLikeBase::vector_non_const(const ColumnString& values, const ColumnString& patterns,
                                          ColumnUInt8::Container& result, LikeState* state,
                                          size_t input_rows_count) const {
    ColumnString::MutablePtr replaced_patterns;
    // Points at the column that really holds the patterns to evaluate. It is only
    // redirected after the rewritten column has been built, so it can never refer
    // to an unset `replaced_patterns`.
    const ColumnString* effective_patterns = &patterns;
    VPatternSearchStateSPtr vector_search_state;

    if (state->is_like_pattern) {
        if (state->has_custom_escape) {
            replaced_patterns = ColumnString::create();
            for (size_t i = 0; i < input_rows_count; ++i) {
                const std::string val =
                        replace_pattern_by_escape(patterns.get_data_at(i), state->escape_char);
                replaced_patterns->insert_data(val.c_str(), val.size());
            }
            effective_patterns = &*replaced_patterns;
        }
        vector_search_state = pattern_type_recognition<true>(*effective_patterns);
    } else {
        vector_search_state = pattern_type_recognition<false>(patterns);
    }

    if (vector_search_state == nullptr) {
        // pattern type recognition failed, use default case
        for (size_t i = 0; i < input_rows_count; ++i) {
            const auto pattern_val = effective_patterns->get_data_at(i);
            const auto value_val = values.get_data_at(i);
            RETURN_IF_ERROR((state->scalar_function)(&state->search_state, value_val, pattern_val,
                                                     &result[i]));
        }
        return Status::OK();
    }

    const auto* search_strings =
            static_cast<const ColumnString*>(vector_search_state->_search_strings.get());
    return (vector_search_state->_vector_function)(values, *search_strings, result);
}
711
712
// Column-level LIKE for a pattern not specialised at open time: the LIKE pattern is
// translated to a regular expression by `convert_like_pattern` and the evaluation is
// handed over to `regexp_fn`.
Status FunctionLike::like_fn(const LikeSearchState* state, const ColumnString& val,
                             const StringRef& pattern, ColumnUInt8::Container& result) {
    const std::string like_pattern(pattern.data, pattern.size);
    std::string regex_text;
    convert_like_pattern(state, like_pattern, &regex_text);
    const StringRef regex_ref(regex_text.c_str(), regex_text.size());
    return regexp_fn(state, val, regex_ref, result);
}
718
719
// Row-level LIKE evaluation.
//
// `extract_like_fast_path` classifies the pattern and yields the literal to compare
// against. All-pass, equality, prefix, suffix and substring patterns are answered with
// plain memory comparisons; anything else is translated by `convert_like_pattern` and
// evaluated by `regexp_fn_scalar`. `*result` receives 1 on match and 0 otherwise.
Status FunctionLike::like_fn_scalar(const LikeSearchState* state, const StringRef& val,
                                    const StringRef& pattern, unsigned char* result) {
    // Try to use fast path to avoid regex compilation
    std::string literal;
    const LikeFastPath path = extract_like_fast_path(pattern.data, pattern.size, literal);
    const size_t literal_len = literal.size();

    if (path == LikeFastPath::ALLPASS) {
        *result = 1;
        return Status::OK();
    }

    if (path == LikeFastPath::EQUALS) {
        bool matched = false;
        if (val.size == literal_len) {
            // An empty literal matches an empty value without touching the buffers.
            matched = literal.empty() || memcmp(val.data, literal.data(), literal_len) == 0;
        }
        *result = matched;
        return Status::OK();
    }

    if (path == LikeFastPath::STARTS_WITH) {
        bool matched = false;
        if (val.size >= literal_len) {
            matched = memcmp(val.data, literal.data(), literal_len) == 0;
        }
        *result = matched;
        return Status::OK();
    }

    if (path == LikeFastPath::ENDS_WITH) {
        bool matched = false;
        if (val.size >= literal_len) {
            const char* tail = val.data + val.size - literal_len;
            matched = memcmp(tail, literal.data(), literal_len) == 0;
        }
        *result = matched;
        return Status::OK();
    }

    if (path == LikeFastPath::SUBSTRING) {
        if (literal.empty()) {
            *result = 1;
        } else {
            // Use memmem for substring search
            const void* hit = memmem(val.data, val.size, literal.data(), literal_len);
            *result = (hit != nullptr);
        }
        return Status::OK();
    }

    // LikeFastPath::REGEX and any other value: fall back to regex matching
    std::string regex_text;
    convert_like_pattern(state, std::string(pattern.data, pattern.size), &regex_text);
    return regexp_fn_scalar(state, StringRef(val.data, val.size),
                            {regex_text.c_str(), regex_text.size()}, result);
}
761
762
// Translates a SQL LIKE pattern into an anchored regular expression.
//
// Rules:
//  - an empty pattern becomes "^$";
//  - "%" becomes ".*" and "_" becomes ".";
//  - "\%" and "\_" become the literal characters "%" and "_", "\\" becomes an
//    escaped backslash;
//  - every other character is copied literally, preceded by a backslash when it is
//    one of the characters treated as special by the regex engines;
//  - "^" is prepended unless the pattern starts with "%";
//  - "$" is appended unless the pattern ends with an unescaped "%".
//
// `state` is not read. `re_pattern` is cleared first and receives the result.
void FunctionLike::convert_like_pattern(const LikeSearchState* state, const std::string& pattern,
                                        std::string* re_pattern) {
    re_pattern->clear();

    if (pattern.empty()) {
        re_pattern->append("^$");
        return;
    }

    const size_t len = pattern.size();
    // Worst case every character is escaped, plus the two anchors.
    re_pattern->reserve(len * 2 + 2);

    // add ^ to pattern head to match line head
    if (pattern.front() != '%') {
        re_pattern->append("^");
    }

    // Whether the most recently translated token was an unescaped '%'. Looking at the
    // last output character instead is wrong: a literal '*' is emitted as "\*" and
    // would be mistaken for the tail of ".*", dropping the end anchor.
    bool ends_with_wildcard = false;

    // expect % and _, all chars should keep it literal mean.
    for (size_t i = 0; i < len; ++i) {
        const char c = pattern[i];
        ends_with_wildcard = false;

        if (c == '\\' && i + 1 < len) {
            const char next_c = pattern[i + 1];
            if (next_c == '%' || next_c == '_') {
                // convert "\%" and "\_" to literal "%" and "_"
                re_pattern->append(1, next_c);
                ++i;
                continue;
            }
            if (next_c == '\\') {
                // keep valid escape "\\"
                re_pattern->append("\\\\");
                ++i;
                continue;
            }
            // Any other character after a backslash: the backslash itself is treated
            // as a literal below and the next character is handled on its own turn.
        }

        if (c == '%') {
            re_pattern->append(".*");
            ends_with_wildcard = true;
        } else if (c == '_') {
            re_pattern->append(".");
        } else {
            // special for hyperscan: [, ], (, ), {, }, -, *, +, \, |, /, :, ^, ., $, ?
            switch (c) {
            case '[':
            case ']':
            case '(':
            case ')':
            case '{':
            case '}':
            case '-':
            case '*':
            case '+':
            case '\\':
            case '|':
            case '/':
            case ':':
            case '^':
            case '.':
            case '$':
            case '?':
                re_pattern->append(1, '\\');
                break;
            default:
                break;
            }
            re_pattern->append(1, c);
        }
    }

    // add $ to pattern tail to match line tail
    if (!ends_with_wildcard) {
        re_pattern->append("$");
    }
}
814
815
415
// Strips LIKE escape backslashes from `*search_string` in place.
//
// A backslash directly followed by '%', '_' or '\' is dropped and the following
// character is kept; every other character, including a backslash that is last or
// followed by anything else, is copied unchanged.
//
// sometime 'like' may allowed converted to 'equals/start_with/end_with/sub_with'
// so we need to remove escape from pattern to construct search string and use to do
// 'equals/start_with/end_with/sub_with'
void FunctionLike::remove_escape_character(std::string* search_string) {
    std::string escaped;
    escaped.swap(*search_string);

    const size_t len = escaped.size();
    // The unescaped text is never longer than the input.
    search_string->reserve(len);

    size_t i = 0;
    while (i < len) {
        const bool is_escape_pair =
                escaped[i] == '\\' && i + 1 < len &&
                (escaped[i + 1] == '%' || escaped[i + 1] == '_' || escaped[i + 1] == '\\');
        if (is_escape_pair) {
            // Skip the backslash and keep only the escaped character.
            ++i;
        }
        search_string->push_back(escaped[i]);
        ++i;
    }
}
833
834
0
bool re2_full_match(const std::string& str, const RE2& re, std::vector<std::string>& results) {
835
0
    if (!re.ok()) {
836
0
        return false;
837
0
    }
838
839
0
    std::vector<RE2::Arg> arguments;
840
0
    std::vector<RE2::Arg*> arguments_ptrs;
841
0
    std::size_t args_count = re.NumberOfCapturingGroups();
842
0
    arguments.resize(args_count);
843
0
    arguments_ptrs.resize(args_count);
844
0
    results.resize(args_count);
845
0
    for (std::size_t i = 0; i < args_count; ++i) {
846
0
        arguments[i] = &results[i];
847
0
        arguments_ptrs[i] = &arguments[i];
848
0
    }
849
850
0
    return RE2::FullMatchN(str, re, arguments_ptrs.data(), (int)args_count);
851
0
}
852
853
0
void verbose_log_match(const std::string& str, const std::string& pattern_name, const RE2& re) {
854
0
    std::vector<std::string> results;
855
0
    VLOG_DEBUG << "arg str: " << str << ", size: " << str.size() << ", pattern " << pattern_name
856
0
               << ": " << re.pattern() << ", size: " << re.pattern().size();
857
0
    if (re2_full_match(str, re, results)) {
858
0
        for (int i = 0; i < results.size(); ++i) {
859
0
            VLOG_DEBUG << "match " << i << ": " << results[i] << ", size: " << results[i].size();
860
0
        }
861
0
    } else {
862
0
        VLOG_DEBUG << "no match";
863
0
    }
864
0
}
865
866
// Prepares `state` for a constant LIKE pattern.
//
// The pattern (rewritten by `replace_pattern_by_escape` when a custom escape character
// is set) is stored in the search state and then tested, in this order, against the
// all-pass, equals, starts-with, ends-with and substring shapes. The first shape that
// fits installs the matching constant_* function pair and the unescaped literal.
// Any other pattern is translated to a regex by `convert_like_pattern` and compiled
// with hyperscan when `try_hyperscan` is set and `hs_prepare` succeeds, otherwise with
// RE2. Returns InternalError when the RE2 compilation fails, OK otherwise.
Status FunctionLike::construct_like_const_state(FunctionContext* context, const StringRef& pattern,
                                                std::shared_ptr<LikeState>& state,
                                                bool try_hyperscan) {
    const std::string pattern_str =
            state->has_custom_escape ? replace_pattern_by_escape(pattern, state->escape_char)
                                     : pattern.to_string();
    state->search_state.pattern_str = pattern_str;
    std::string search_string;

    // Shared tail of the literal fast paths: optional debug logging, escape removal,
    // and publishing the literal to the search state.
    auto adopt_literal = [&](const char* re_name, const RE2& re) {
        if (VLOG_DEBUG_IS_ON) {
            verbose_log_match(pattern_str, re_name, re);
            VLOG_DEBUG << "search_string : " << search_string << ", size: " << search_string.size();
        }
        remove_escape_character(&search_string);
        if (VLOG_DEBUG_IS_ON) {
            VLOG_DEBUG << "search_string escape removed: " << search_string
                       << ", size: " << search_string.size();
        }
        state->search_state.set_search_string(search_string);
    };

    if (!pattern_str.empty() && RE2::FullMatch(pattern_str, LIKE_ALLPASS_RE)) {
        state->search_state.set_search_string("");
        state->function = constant_allpass_fn;
        state->scalar_function = constant_allpass_fn_scalar;
    } else if (pattern_str.empty() || RE2::FullMatch(pattern_str, LIKE_EQUALS_RE, &search_string)) {
        adopt_literal("LIKE_EQUALS_RE", LIKE_EQUALS_RE);
        state->function = constant_equals_fn;
        state->scalar_function = constant_equals_fn_scalar;
    } else if (RE2::FullMatch(pattern_str, LIKE_STARTS_WITH_RE, &search_string)) {
        adopt_literal("LIKE_STARTS_WITH_RE", LIKE_STARTS_WITH_RE);
        state->function = constant_starts_with_fn;
        state->scalar_function = constant_starts_with_fn_scalar;
    } else if (RE2::FullMatch(pattern_str, LIKE_ENDS_WITH_RE, &search_string)) {
        adopt_literal("LIKE_ENDS_WITH_RE", LIKE_ENDS_WITH_RE);
        state->function = constant_ends_with_fn;
        state->scalar_function = constant_ends_with_fn_scalar;
    } else if (RE2::FullMatch(pattern_str, LIKE_SUBSTRING_RE, &search_string)) {
        adopt_literal("LIKE_SUBSTRING_RE", LIKE_SUBSTRING_RE);
        state->function = constant_substring_fn;
        state->scalar_function = constant_substring_fn_scalar;
    } else {
        std::string re_pattern;
        convert_like_pattern(&state->search_state, pattern_str, &re_pattern);
        if (VLOG_DEBUG_IS_ON) {
            VLOG_DEBUG << "hyperscan, pattern str: " << pattern_str
                       << ", size: " << pattern_str.size() << ", re pattern: " << re_pattern
                       << ", size: " << re_pattern.size();
        }

        hs_database_t* hs_db = nullptr;
        hs_scratch_t* hs_scratch_space = nullptr;
        const bool hyperscan_ready =
                try_hyperscan &&
                hs_prepare(context, re_pattern.c_str(), &hs_db, &hs_scratch_space).ok();
        if (hyperscan_ready) {
            // use hyperscan
            state->search_state.hs_database.reset(hs_db);
            state->search_state.hs_scratch.reset(hs_scratch_space);
        } else {
            // fallback to re2
            // reset hs_database to nullptr to indicate not use hyperscan
            state->search_state.hs_database.reset();
            state->search_state.hs_scratch.reset();

            RE2::Options re2_options;
            re2_options.set_never_nl(false);
            re2_options.set_dot_nl(true);
            state->search_state.regex = std::make_unique<RE2>(re_pattern, re2_options);
            if (!state->search_state.regex->ok()) {
                return Status::InternalError("Invalid regex expression: {}(origin: {})", re_pattern,
                                             pattern_str);
            }
        }

        state->function = constant_regex_fn;
        state->scalar_function = constant_regex_fn_scalar;
    }
    return Status::OK();
}
970
971
1.07k
// Builds the thread-local LikeState for a LIKE call.
//
// Nothing is done for scopes other than THREAD_LOCAL. Otherwise the state starts with
// the generic `like_fn` / `like_fn_scalar` pair. A constant third argument is taken as
// the custom escape character and must be exactly one byte, else InternalError is
// returned. A constant second argument is pre-analysed by `construct_like_const_state`.
// The state is registered on `context` only after all of that succeeded.
Status FunctionLike::open(FunctionContext* context, FunctionContext::FunctionStateScope scope) {
    if (scope != FunctionContext::THREAD_LOCAL) {
        return Status::OK();
    }

    auto state = std::make_shared<LikeState>();
    state->is_like_pattern = true;
    state->function = like_fn;
    state->scalar_function = like_fn_scalar;

    const bool escape_is_constant = context->is_col_constant(2);
    if (escape_is_constant) {
        state->has_custom_escape = true;
        const auto escape_col = context->get_constant_col(2)->column_ptr;
        const auto& escape = escape_col->get_data_at(0);
        if (escape.size != 1) {
            return Status::InternalError("Escape character must be a single character, got: {}",
                                         escape.to_string());
        }
        state->escape_char = escape.data[0];
    }

    const bool pattern_is_constant = context->is_col_constant(1);
    if (pattern_is_constant) {
        const auto pattern_col = context->get_constant_col(1)->column_ptr;
        const auto& pattern = pattern_col->get_data_at(0);
        RETURN_IF_ERROR(construct_like_const_state(context, pattern, state));
    }

    context->set_function_state(scope, state);
    return Status::OK();
}
998
999
// Builds the thread-local LikeState for a REGEXP call.
//
// Nothing is done for scopes other than THREAD_LOCAL. The state is registered on
// `context` right away and starts with the generic `regexp_fn` / `regexp_fn_scalar`
// pair. When the pattern argument is constant it is tested, in order, against the
// all-pass, equals, starts-with, ends-with and substring shapes; the first fit installs
// the matching constant_* function pair. Any other pattern is compiled with hyperscan,
// falling back to RE2 and, when RE2 rejects it and extended regex is enabled, to
// Boost.Regex. Returns InternalError when no engine accepts the pattern.
Status FunctionRegexpLike::open(FunctionContext* context,
                                FunctionContext::FunctionStateScope scope) {
    if (scope != FunctionContext::THREAD_LOCAL) {
        return Status::OK();
    }

    auto state = std::make_shared<LikeState>();
    context->set_function_state(scope, state);
    state->is_like_pattern = false;
    state->function = regexp_fn;
    state->scalar_function = regexp_fn_scalar;

    if (!context->is_col_constant(1)) {
        return Status::OK();
    }

    const auto pattern_col = context->get_constant_col(1)->column_ptr;
    const std::string pattern_str = pattern_col->get_data_at(0).to_string();
    LikeSearchState& search = state->search_state;
    std::string literal;

    if (RE2::FullMatch(pattern_str, ALLPASS_RE)) {
        search.set_search_string("");
        state->function = constant_allpass_fn;
        state->scalar_function = constant_allpass_fn_scalar;
        return Status::OK();
    }
    if (RE2::FullMatch(pattern_str, EQUALS_RE, &literal)) {
        search.set_search_string(literal);
        state->function = constant_equals_fn;
        state->scalar_function = constant_equals_fn_scalar;
        return Status::OK();
    }
    if (RE2::FullMatch(pattern_str, STARTS_WITH_RE, &literal)) {
        search.set_search_string(literal);
        state->function = constant_starts_with_fn;
        state->scalar_function = constant_starts_with_fn_scalar;
        return Status::OK();
    }
    if (RE2::FullMatch(pattern_str, ENDS_WITH_RE, &literal)) {
        search.set_search_string(literal);
        state->function = constant_ends_with_fn;
        state->scalar_function = constant_ends_with_fn_scalar;
        return Status::OK();
    }
    if (RE2::FullMatch(pattern_str, SUBSTRING_RE, &literal)) {
        search.set_search_string(literal);
        state->function = constant_substring_fn;
        state->scalar_function = constant_substring_fn_scalar;
        return Status::OK();
    }

    // General regular expression: pick an engine.
    hs_database_t* hs_db = nullptr;
    hs_scratch_t* hs_scratch_space = nullptr;
    if (hs_prepare(context, pattern_str.c_str(), &hs_db, &hs_scratch_space).ok()) {
        // use hyperscan
        search.hs_database.reset(hs_db);
        search.hs_scratch.reset(hs_scratch_space);
    } else {
        // fallback to re2
        // reset hs_database to nullptr to indicate not use hyperscan
        search.hs_database.reset();
        search.hs_scratch.reset();
        RE2::Options re2_options;
        re2_options.set_never_nl(false);
        re2_options.set_dot_nl(true);
        search.regex = std::make_unique<RE2>(pattern_str, re2_options);
        if (!search.regex->ok()) {
            if (!context->state()->enable_extended_regex()) {
                return Status::InternalError(
                        "Invalid regex expression: {}. Error: {}. If you need advanced "
                        "regex features, try setting enable_extended_regex=true",
                        pattern_str, search.regex->error());
            }

            // RE2 failed, fallback to Boost.Regex
            // This handles advanced regex features like zero-width assertions
            search.regex.reset();
            try {
                search.boost_regex = std::make_unique<boost::regex>(pattern_str);
            } catch (const boost::regex_error& e) {
                return Status::InternalError("Invalid regex expression: {}. Error: {}",
                                             pattern_str, e.what());
            }
        }
    }

    state->function = constant_regex_fn;
    state->scalar_function = constant_regex_fn_scalar;
    return Status::OK();
}
1077
1078
1
// Registers the LIKE function (`FunctionLike`) with the given function factory.
void register_function_like(SimpleFunctionFactory& factory) {
    factory.register_function<FunctionLike>();
}
1081
1082
1
// Registers the REGEXP function (`FunctionRegexpLike`) with the given function factory
// and makes it reachable under its alias as well.
void register_function_regexp(SimpleFunctionFactory& factory) {
    using RegexpFn = FunctionRegexpLike;
    factory.register_function<RegexpFn>();
    factory.register_alias(RegexpFn::name, RegexpFn::alias);
}
1086
} // namespace doris