Coverage Report

Created: 2025-11-14 17:37

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/root/doris/be/src/vec/functions/like.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "like.h"
19
20
#include <fmt/format.h>
21
#include <hs/hs_compile.h>
22
#include <re2/stringpiece.h>
23
24
#include <cstddef>
25
#include <ostream>
26
#include <utility>
27
#include <vector>
28
29
#include "common/logging.h"
30
#include "vec/columns/column.h"
31
#include "vec/columns/column_const.h"
32
#include "vec/columns/column_vector.h"
33
#include "vec/common/string_ref.h"
34
#include "vec/core/block.h"
35
#include "vec/core/column_with_type_and_name.h"
36
#include "vec/functions/simple_function_factory.h"
37
38
namespace doris::vectorized {
39
#include "common/compile_check_begin.h"
40
// A regex to match any regex pattern is equivalent to a substring search.
41
static const RE2 SUBSTRING_RE(R"((?:\.\*)*([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)(?:\.\*)*)");
42
43
// A regex to match any regex pattern which is equivalent to matching a constant string
44
// at the end of the string values.
45
static const RE2 ENDS_WITH_RE(R"((?:\.\*)*([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)\$)");
46
47
// A regex to match any regex pattern which is equivalent to matching a constant string
48
// at the end of the string values.
49
static const RE2 STARTS_WITH_RE(R"(\^([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)(?:\.\*)*)");
50
51
// A regex to match any regex pattern which is equivalent to a constant string match.
52
static const RE2 EQUALS_RE(R"(\^([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)\$)");
53
// A regex to match .*
54
static const RE2 ALLPASS_RE(R"((\.\*)+)");
55
56
// Like patterns
57
static const re2::RE2 LIKE_SUBSTRING_RE(R"((?:%+)(((\\_)|([^%_\\]))+)(?:%+))");
58
static const re2::RE2 LIKE_ENDS_WITH_RE("(?:%+)(((\\\\_)|([^%_]))+)");
59
static const re2::RE2 LIKE_STARTS_WITH_RE(R"((((\\%)|(\\_)|([^%_\\]))+)(?:%+))");
60
static const re2::RE2 LIKE_EQUALS_RE("(((\\\\_)|([^%_]))+)");
61
static const re2::RE2 LIKE_ALLPASS_RE("%+");
62
63
struct VectorAllpassSearchState : public VectorPatternSearchState {
64
307
    VectorAllpassSearchState() : VectorPatternSearchState(FunctionLikeBase::vector_allpass_fn) {}
65
66
307
    ~VectorAllpassSearchState() override = default;
67
68
312
    void like_pattern_match(const std::string& pattern_str) override {
69
312
        if (!pattern_str.empty() && RE2::FullMatch(pattern_str, LIKE_ALLPASS_RE)) {
70
26
            _search_strings->insert_default();
71
286
        } else {
72
286
            _pattern_matched = false;
73
286
        }
74
312
    }
75
76
0
    void regexp_pattern_match(const std::string& pattern_str) override {
77
0
        if (RE2::FullMatch(pattern_str, ALLPASS_RE)) {
78
0
            _search_strings->insert_default();
79
0
        } else {
80
0
            _pattern_matched = false;
81
0
        }
82
0
    }
83
};
84
85
struct VectorEqualSearchState : public VectorPatternSearchState {
86
307
    VectorEqualSearchState() : VectorPatternSearchState(FunctionLikeBase::vector_equals_fn) {}
87
88
307
    ~VectorEqualSearchState() override = default;
89
90
315
    void like_pattern_match(const std::string& pattern_str) override {
91
315
        _search_string.clear();
92
315
        if (pattern_str.empty() || RE2::FullMatch(pattern_str, LIKE_EQUALS_RE, &_search_string)) {
93
86
            FunctionLike::remove_escape_character(&_search_string);
94
86
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
95
229
        } else {
96
229
            _pattern_matched = false;
97
229
        }
98
315
    }
99
100
0
    void regexp_pattern_match(const std::string& pattern_str) override {
101
0
        _search_string.clear();
102
0
        if (RE2::FullMatch(pattern_str, EQUALS_RE, &_search_string)) {
103
0
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
104
0
        } else {
105
0
            _pattern_matched = false;
106
0
        }
107
0
    }
108
};
109
110
struct VectorSubStringSearchState : public VectorPatternSearchState {
111
    VectorSubStringSearchState()
112
307
            : VectorPatternSearchState(FunctionLikeBase::vector_substring_fn) {}
113
114
307
    ~VectorSubStringSearchState() override = default;
115
116
321
    void like_pattern_match(const std::string& pattern_str) override {
117
321
        _search_string.clear();
118
321
        if (RE2::FullMatch(pattern_str, LIKE_SUBSTRING_RE, &_search_string)) {
119
41
            FunctionLike::remove_escape_character(&_search_string);
120
41
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
121
280
        } else {
122
280
            _pattern_matched = false;
123
280
        }
124
321
    }
125
126
0
    void regexp_pattern_match(const std::string& pattern_str) override {
127
0
        _search_string.clear();
128
0
        if (RE2::FullMatch(pattern_str, SUBSTRING_RE, &_search_string)) {
129
0
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
130
0
        } else {
131
0
            _pattern_matched = false;
132
0
        }
133
0
    }
134
};
135
136
struct VectorStartsWithSearchState : public VectorPatternSearchState {
137
    VectorStartsWithSearchState()
138
307
            : VectorPatternSearchState(FunctionLikeBase::vector_starts_with_fn) {}
139
140
307
    ~VectorStartsWithSearchState() override = default;
141
142
313
    void like_pattern_match(const std::string& pattern_str) override {
143
313
        _search_string.clear();
144
313
        if (RE2::FullMatch(pattern_str, LIKE_STARTS_WITH_RE, &_search_string)) {
145
37
            FunctionLike::remove_escape_character(&_search_string);
146
37
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
147
276
        } else {
148
276
            _pattern_matched = false;
149
276
        }
150
313
    }
151
152
0
    void regexp_pattern_match(const std::string& pattern_str) override {
153
0
        _search_string.clear();
154
0
        if (RE2::FullMatch(pattern_str, STARTS_WITH_RE, &_search_string)) {
155
0
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
156
0
        } else {
157
0
            _pattern_matched = false;
158
0
        }
159
0
    }
160
};
161
162
struct VectorEndsWithSearchState : public VectorPatternSearchState {
163
307
    VectorEndsWithSearchState() : VectorPatternSearchState(FunctionLikeBase::vector_ends_with_fn) {}
164
165
307
    ~VectorEndsWithSearchState() override = default;
166
167
312
    void like_pattern_match(const std::string& pattern_str) override {
168
312
        _search_string.clear();
169
312
        if (RE2::FullMatch(pattern_str, LIKE_ENDS_WITH_RE, &_search_string)) {
170
36
            FunctionLike::remove_escape_character(&_search_string);
171
36
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
172
276
        } else {
173
276
            _pattern_matched = false;
174
276
        }
175
312
    }
176
177
0
    void regexp_pattern_match(const std::string& pattern_str) override {
178
0
        _search_string.clear();
179
0
        if (RE2::FullMatch(pattern_str, ENDS_WITH_RE, &_search_string)) {
180
0
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
181
0
        } else {
182
0
            _pattern_matched = false;
183
0
        }
184
0
    }
185
};
186
187
0
Status LikeSearchState::clone(LikeSearchState& cloned) {
188
0
    cloned.set_search_string(search_string);
189
190
0
    std::string re_pattern;
191
0
    FunctionLike::convert_like_pattern(this, pattern_str, &re_pattern);
192
0
    if (hs_database) { // use hyperscan
193
0
        hs_database_t* database = nullptr;
194
0
        hs_scratch_t* scratch = nullptr;
195
0
        RETURN_IF_ERROR(FunctionLike::hs_prepare(nullptr, re_pattern.c_str(), &database, &scratch));
196
197
0
        cloned.hs_database.reset(database);
198
0
        cloned.hs_scratch.reset(scratch);
199
0
    } else { // fallback to re2
200
0
        cloned.hs_database.reset();
201
0
        cloned.hs_scratch.reset();
202
203
0
        RE2::Options opts;
204
0
        opts.set_never_nl(false);
205
0
        opts.set_dot_nl(true);
206
0
        cloned.regex = std::make_unique<RE2>(re_pattern, opts);
207
0
        if (!cloned.regex->ok()) {
208
0
            return Status::InternalError("Invalid regex expression: {}", re_pattern);
209
0
        }
210
0
    }
211
212
0
    return Status::OK();
213
0
}
214
215
Status FunctionLikeBase::constant_allpass_fn(const LikeSearchState* state, const ColumnString& vals,
216
                                             const StringRef& pattern,
217
14
                                             ColumnUInt8::Container& result) {
218
14
    memset(result.data(), 1, vals.size());
219
14
    return Status::OK();
220
14
}
221
222
Status FunctionLikeBase::constant_allpass_fn_scalar(const LikeSearchState* state,
223
                                                    const StringRef& val, const StringRef& pattern,
224
0
                                                    unsigned char* result) {
225
0
    *result = 1;
226
0
    return Status::OK();
227
0
}
228
229
Status FunctionLikeBase::vector_allpass_fn(const ColumnString& vals,
230
                                           const ColumnString& search_strings,
231
21
                                           ColumnUInt8::Container& result) {
232
21
    DCHECK(vals.size() == search_strings.size());
233
21
    DCHECK(vals.size() == result.size());
234
21
    memset(result.data(), 1, vals.size());
235
21
    return Status::OK();
236
21
}
237
238
Status FunctionLikeBase::constant_starts_with_fn(const LikeSearchState* state,
239
                                                 const ColumnString& val, const StringRef& pattern,
240
22
                                                 ColumnUInt8::Container& result) {
241
22
    auto sz = val.size();
242
44
    for (size_t i = 0; i < sz; i++) {
243
22
        const auto& str_ref = val.get_data_at(i);
244
22
        result[i] = (str_ref.size >= state->search_string_sv.size) &&
245
22
                    str_ref.start_with(state->search_string_sv);
246
22
    }
247
22
    return Status::OK();
248
22
}
249
250
Status FunctionLikeBase::constant_starts_with_fn_scalar(const LikeSearchState* state,
251
                                                        const StringRef& val,
252
                                                        const StringRef& pattern,
253
0
                                                        unsigned char* result) {
254
0
    *result = (val.size >= state->search_string_sv.size) &&
255
0
              (state->search_string_sv == val.substring(0, state->search_string_sv.size));
256
0
    return Status::OK();
257
0
}
258
259
Status FunctionLikeBase::vector_starts_with_fn(const ColumnString& vals,
260
                                               const ColumnString& search_strings,
261
31
                                               ColumnUInt8::Container& result) {
262
31
    DCHECK(vals.size() == search_strings.size());
263
31
    DCHECK(vals.size() == result.size());
264
31
    auto sz = vals.size();
265
67
    for (size_t i = 0; i < sz; ++i) {
266
36
        const auto& str_sv = vals.get_data_at(i);
267
36
        const auto& search_string_sv = search_strings.get_data_at(i);
268
36
        result[i] = (str_sv.size >= search_string_sv.size) && str_sv.start_with(search_string_sv);
269
36
    }
270
31
    return Status::OK();
271
31
}
272
273
Status FunctionLikeBase::constant_ends_with_fn(const LikeSearchState* state,
274
                                               const ColumnString& val, const StringRef& pattern,
275
22
                                               ColumnUInt8::Container& result) {
276
22
    auto sz = val.size();
277
44
    for (size_t i = 0; i < sz; i++) {
278
22
        const auto& str_ref = val.get_data_at(i);
279
22
        result[i] = (str_ref.size >= state->search_string_sv.size) &&
280
22
                    str_ref.end_with(state->search_string_sv);
281
22
    }
282
22
    return Status::OK();
283
22
}
284
285
Status FunctionLikeBase::constant_ends_with_fn_scalar(const LikeSearchState* state,
286
                                                      const StringRef& val,
287
                                                      const StringRef& pattern,
288
0
                                                      unsigned char* result) {
289
0
    *result = (val.size >= state->search_string_sv.size) &&
290
0
              (state->search_string_sv == val.substring(val.size - state->search_string_sv.size,
291
0
                                                        state->search_string_sv.size));
292
0
    return Status::OK();
293
0
}
294
295
Status FunctionLikeBase::vector_ends_with_fn(const ColumnString& vals,
296
                                             const ColumnString& search_strings,
297
31
                                             ColumnUInt8::Container& result) {
298
31
    DCHECK(vals.size() == search_strings.size());
299
31
    DCHECK(vals.size() == result.size());
300
31
    auto sz = vals.size();
301
67
    for (size_t i = 0; i < sz; ++i) {
302
36
        const auto& str_sv = vals.get_data_at(i);
303
36
        const auto& search_string_sv = search_strings.get_data_at(i);
304
36
        result[i] = (str_sv.size >= search_string_sv.size) && str_sv.end_with(search_string_sv);
305
36
    }
306
31
    return Status::OK();
307
31
}
308
309
Status FunctionLikeBase::constant_equals_fn(const LikeSearchState* state, const ColumnString& val,
310
                                            const StringRef& pattern,
311
47
                                            ColumnUInt8::Container& result) {
312
47
    auto sz = val.size();
313
94
    for (size_t i = 0; i < sz; i++) {
314
47
        result[i] = (val.get_data_at(i) == state->search_string_sv);
315
47
    }
316
47
    return Status::OK();
317
47
}
318
319
Status FunctionLikeBase::constant_equals_fn_scalar(const LikeSearchState* state,
320
                                                   const StringRef& val, const StringRef& pattern,
321
0
                                                   unsigned char* result) {
322
0
    *result = (val == state->search_string_sv);
323
0
    return Status::OK();
324
0
}
325
326
Status FunctionLikeBase::vector_equals_fn(const ColumnString& vals,
327
                                          const ColumnString& search_strings,
328
78
                                          ColumnUInt8::Container& result) {
329
78
    DCHECK(vals.size() == search_strings.size());
330
78
    DCHECK(vals.size() == result.size());
331
78
    auto sz = vals.size();
332
156
    for (size_t i = 0; i < sz; ++i) {
333
78
        const auto& str_sv = vals.get_data_at(i);
334
78
        const auto& search_string_sv = search_strings.get_data_at(i);
335
78
        result[i] = str_sv == search_string_sv;
336
78
    }
337
78
    return Status::OK();
338
78
}
339
340
Status FunctionLikeBase::constant_substring_fn(const LikeSearchState* state,
341
                                               const ColumnString& val, const StringRef& pattern,
342
0
                                               ColumnUInt8::Container& result) {
343
0
    auto sz = val.size();
344
0
    for (size_t i = 0; i < sz; i++) {
345
0
        if (state->search_string_sv.size == 0) {
346
0
            result[i] = true;
347
0
            continue;
348
0
        }
349
0
        result[i] = state->substring_pattern.search(val.get_data_at(i)) != -1;
350
0
    }
351
0
    return Status::OK();
352
0
}
353
354
Status FunctionLikeBase::constant_substring_fn_scalar(const LikeSearchState* state,
355
                                                      const StringRef& val,
356
                                                      const StringRef& pattern,
357
0
                                                      unsigned char* result) {
358
0
    if (state->search_string_sv.size == 0) {
359
0
        *result = true;
360
0
        return Status::OK();
361
0
    }
362
0
    *result = state->substring_pattern.search(val) != -1;
363
0
    return Status::OK();
364
0
}
365
366
Status FunctionLikeBase::vector_substring_fn(const ColumnString& vals,
367
                                             const ColumnString& search_strings,
368
27
                                             ColumnUInt8::Container& result) {
369
27
    DCHECK(vals.size() == search_strings.size());
370
27
    DCHECK(vals.size() == result.size());
371
27
    auto sz = vals.size();
372
54
    for (size_t i = 0; i < sz; ++i) {
373
27
        const auto& str_sv = vals.get_data_at(i);
374
27
        const auto& search_string_sv = search_strings.get_data_at(i);
375
27
        if (search_string_sv.size == 0) {
376
0
            result[i] = true;
377
0
            continue;
378
0
        }
379
27
        doris::StringSearch substring_search(&search_string_sv);
380
27
        result[i] = substring_search.search(str_sv) != -1;
381
27
    }
382
27
    return Status::OK();
383
27
}
384
385
Status FunctionLikeBase::constant_regex_fn_scalar(const LikeSearchState* state,
386
                                                  const StringRef& val, const StringRef& pattern,
387
53
                                                  unsigned char* result) {
388
53
    if (state->hs_database) { // use hyperscan
389
53
        auto ret = hs_scan(state->hs_database.get(), val.data, (int)val.size, 0,
390
53
                           state->hs_scratch.get(),
391
53
                           doris::vectorized::LikeSearchState::hs_match_handler, (void*)result);
392
53
        if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
393
0
            return Status::RuntimeError(fmt::format("hyperscan error: {}", ret));
394
0
        }
395
53
    } else if (state->boost_regex) { // use boost::regex for advanced features
396
0
        *result = boost::regex_search(val.data, val.data + val.size, *state->boost_regex);
397
0
    } else { // fallback to re2
398
0
        *result = RE2::PartialMatch(re2::StringPiece(val.data, val.size), *state->regex);
399
0
    }
400
401
53
    return Status::OK();
402
53
}
403
404
Status FunctionLikeBase::regexp_fn_scalar(const LikeSearchState* state, const StringRef& val,
405
209
                                          const StringRef& pattern, unsigned char* result) {
406
209
    RE2::Options opts;
407
209
    opts.set_never_nl(false);
408
209
    opts.set_dot_nl(true);
409
209
    re2::RE2 re(re2::StringPiece(pattern.data, pattern.size), opts);
410
209
    if (re.ok()) {
411
209
        *result = RE2::PartialMatch(re2::StringPiece(val.data, val.size), re);
412
209
    } else {
413
0
        return Status::RuntimeError("Invalid pattern: {}", pattern.debug_string());
414
0
    }
415
416
209
    return Status::OK();
417
209
}
418
419
Status FunctionLikeBase::constant_regex_fn(const LikeSearchState* state, const ColumnString& val,
420
                                           const StringRef& pattern,
421
75
                                           ColumnUInt8::Container& result) {
422
75
    auto sz = val.size();
423
75
    if (state->hs_database) { // use hyperscan
424
150
        for (size_t i = 0; i < sz; i++) {
425
75
            const auto& str_ref = val.get_data_at(i);
426
75
            auto ret = hs_scan(state->hs_database.get(), str_ref.data, (int)str_ref.size, 0,
427
75
                               state->hs_scratch.get(),
428
75
                               doris::vectorized::LikeSearchState::hs_match_handler,
429
75
                               (void*)(result.data() + i));
430
75
            if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
431
0
                return Status::RuntimeError(fmt::format("hyperscan error: {}", ret));
432
0
            }
433
75
        }
434
75
    } else if (state->boost_regex) { // use boost::regex for advanced features
435
0
        for (size_t i = 0; i < sz; i++) {
436
0
            const auto& str_ref = val.get_data_at(i);
437
0
            *(result.data() + i) = boost::regex_search(str_ref.data, str_ref.data + str_ref.size,
438
0
                                                       *state->boost_regex);
439
0
        }
440
0
    } else { // fallback to re2
441
0
        for (size_t i = 0; i < sz; i++) {
442
0
            const auto& str_ref = val.get_data_at(i);
443
0
            *(result.data() + i) =
444
0
                    RE2::PartialMatch(re2::StringPiece(str_ref.data, str_ref.size), *state->regex);
445
0
        }
446
0
    }
447
448
75
    return Status::OK();
449
75
}
450
451
Status FunctionLikeBase::regexp_fn(const LikeSearchState* state, const ColumnString& val,
452
0
                                   const StringRef& pattern, ColumnUInt8::Container& result) {
453
0
    std::string re_pattern(pattern.data, pattern.size);
454
455
0
    hs_database_t* database = nullptr;
456
0
    hs_scratch_t* scratch = nullptr;
457
0
    if (hs_prepare(nullptr, re_pattern.c_str(), &database, &scratch).ok()) { // use hyperscan
458
0
        auto sz = val.size();
459
0
        for (size_t i = 0; i < sz; i++) {
460
0
            const auto& str_ref = val.get_data_at(i);
461
0
            auto ret = hs_scan(database, str_ref.data, (int)str_ref.size, 0, scratch,
462
0
                               doris::vectorized::LikeSearchState::hs_match_handler,
463
0
                               (void*)(result.data() + i));
464
0
            if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
465
0
                return Status::RuntimeError(fmt::format("hyperscan error: {}", ret));
466
0
            }
467
0
        }
468
469
0
        hs_free_scratch(scratch);
470
0
        hs_free_database(database);
471
0
    } else { // fallback to re2
472
0
        RE2::Options opts;
473
0
        opts.set_never_nl(false);
474
0
        opts.set_dot_nl(true);
475
0
        re2::RE2 re(re_pattern, opts);
476
0
        if (re.ok()) {
477
0
            auto sz = val.size();
478
0
            for (size_t i = 0; i < sz; i++) {
479
0
                const auto& str_ref = val.get_data_at(i);
480
0
                *(result.data() + i) =
481
0
                        RE2::PartialMatch(re2::StringPiece(str_ref.data, str_ref.size), re);
482
0
            }
483
0
        } else {
484
0
            return Status::RuntimeError("Invalid pattern: {}", pattern.debug_string());
485
0
        }
486
0
    }
487
488
0
    return Status::OK();
489
0
}
490
491
// hyperscan compile expression to database and allocate scratch space
492
Status FunctionLikeBase::hs_prepare(FunctionContext* context, const char* expression,
493
132
                                    hs_database_t** database, hs_scratch_t** scratch) {
494
132
    hs_compile_error_t* compile_err;
495
132
    auto res = hs_compile(expression, HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8,
496
132
                          HS_MODE_BLOCK, nullptr, database, &compile_err);
497
498
132
    if (res != HS_SUCCESS) {
499
0
        *database = nullptr;
500
0
        std::string error_message = compile_err->message;
501
0
        hs_free_compile_error(compile_err);
502
        // Do not call FunctionContext::set_error here, since we do not want to cancel the query here.
503
0
        return Status::RuntimeError<false>("hs_compile regex pattern error:" + error_message);
504
0
    }
505
132
    hs_free_compile_error(compile_err);
506
507
132
    if (hs_alloc_scratch(*database, scratch) != HS_SUCCESS) {
508
0
        hs_free_database(*database);
509
0
        *database = nullptr;
510
0
        *scratch = nullptr;
511
        // Do not call FunctionContext::set_error here, since we do not want to cancel the query here.
512
0
        return Status::RuntimeError<false>("hs_alloc_scratch allocate scratch space error");
513
0
    }
514
515
132
    return Status::OK();
516
132
}
517
518
Status FunctionLikeBase::execute_impl(FunctionContext* context, Block& block,
519
                                      const ColumnNumbers& arguments, uint32_t result,
520
553
                                      size_t input_rows_count) const {
521
553
    const auto values_col =
522
553
            block.get_by_position(arguments[0]).column->convert_to_full_column_if_const();
523
553
    const auto* values = check_and_get_column<ColumnString>(values_col.get());
524
525
553
    if (!values) {
526
0
        return Status::InternalError("Not supported input arguments types");
527
0
    }
528
    // result column
529
553
    auto res = ColumnUInt8::create();
530
553
    ColumnUInt8::Container& vec_res = res->get_data();
531
    // set default value to 0, and match functions only need to set 1/true
532
553
    vec_res.resize_fill(input_rows_count);
533
553
    auto* state = reinterpret_cast<LikeState*>(
534
553
            context->get_function_state(FunctionContext::THREAD_LOCAL));
535
    // for constant_substring_fn, use long run length search for performance
536
553
    if (constant_substring_fn ==
537
553
        *(state->function
538
553
                  .target<doris::Status (*)(const LikeSearchState* state, const ColumnString&,
539
553
                                            const StringRef&, ColumnUInt8::Container&)>())) {
540
66
        RETURN_IF_ERROR(execute_substring(values->get_chars(), values->get_offsets(), vec_res,
541
66
                                          &state->search_state));
542
487
    } else {
543
487
        const auto pattern_col = block.get_by_position(arguments[1]).column;
544
487
        if (const auto* str_patterns = check_and_get_column<ColumnString>(pattern_col.get())) {
545
307
            RETURN_IF_ERROR(
546
307
                    vector_non_const(*values, *str_patterns, vec_res, state, input_rows_count));
547
307
        } else if (const auto* const_patterns =
548
180
                           check_and_get_column<ColumnConst>(pattern_col.get())) {
549
180
            const auto& pattern_val = const_patterns->get_data_at(0);
550
180
            RETURN_IF_ERROR(vector_const(*values, &pattern_val, vec_res, state->function,
551
180
                                         &state->search_state));
552
180
        } else {
553
0
            return Status::InternalError("Not supported input arguments types");
554
0
        }
555
487
    }
556
553
    block.replace_by_position(result, std::move(res));
557
553
    return Status::OK();
558
553
}
559
560
Status FunctionLikeBase::execute_substring(const ColumnString::Chars& values,
561
                                           const ColumnString::Offsets& value_offsets,
562
                                           ColumnUInt8::Container& result,
563
66
                                           LikeSearchState* search_state) const {
564
    // treat continuous multi string data as a long string data
565
66
    const UInt8* begin = values.data();
566
66
    const UInt8* end = begin + values.size();
567
66
    const UInt8* pos = begin;
568
569
    /// Current index in the array of strings.
570
66
    size_t i = 0;
571
66
    size_t needle_size = search_state->substring_pattern.get_pattern_length();
572
573
    /// We will search for the next occurrence in all strings at once.
574
108
    while (pos < end) {
575
        // search return matched substring start offset
576
64
        pos = (UInt8*)search_state->substring_pattern.search((char*)pos, end - pos);
577
64
        if (pos >= end) {
578
22
            break;
579
22
        }
580
581
        /// Determine which index it refers to.
582
        /// begin + value_offsets[i] is the start offset of string at i+1
583
42
        while (i < value_offsets.size() && begin + value_offsets[i] < pos) {
584
0
            ++i;
585
0
        }
586
587
        /// We check that the entry does not pass through the boundaries of strings.
588
42
        if (pos + needle_size <= begin + value_offsets[i]) {
589
42
            result[i] = 1;
590
42
        }
591
592
        // move to next string offset
593
42
        pos = begin + value_offsets[i];
594
42
        ++i;
595
42
    }
596
597
66
    return Status::OK();
598
66
}
599
600
Status FunctionLikeBase::vector_const(const ColumnString& values, const StringRef* pattern_val,
601
                                      ColumnUInt8::Container& result, const LikeFn& function,
602
180
                                      LikeSearchState* search_state) const {
603
180
    RETURN_IF_ERROR((function)(search_state, values,
604
180
                               *reinterpret_cast<const StringRef*>(pattern_val), result));
605
180
    return Status::OK();
606
180
}
607
608
template <bool LIKE_PATTERN>
609
307
VPatternSearchStateSPtr FunctionLikeBase::pattern_type_recognition(const ColumnString& patterns) {
610
307
    VPatternSearchStateSPtr allpass_state = std::make_shared<VectorAllpassSearchState>();
611
307
    VPatternSearchStateSPtr equal_state = std::make_shared<VectorEqualSearchState>();
612
307
    VPatternSearchStateSPtr substring_state = std::make_shared<VectorSubStringSearchState>();
613
307
    VPatternSearchStateSPtr starts_with_state = std::make_shared<VectorStartsWithSearchState>();
614
307
    VPatternSearchStateSPtr ends_with_state = std::make_shared<VectorEndsWithSearchState>();
615
307
    size_t size = patterns.size();
616
617
652
    for (size_t i = 0; i < size; ++i) {
618
357
        if (!allpass_state->_pattern_matched && !equal_state->_pattern_matched &&
619
357
            !substring_state->_pattern_matched && !starts_with_state->_pattern_matched &&
620
357
            !ends_with_state->_pattern_matched) {
621
12
            return nullptr;
622
12
        }
623
345
        std::string pattern_str = patterns.get_data_at(i).to_string();
624
345
        if (allpass_state->_pattern_matched) {
625
312
            if constexpr (LIKE_PATTERN) {
626
312
                allpass_state->like_pattern_match(pattern_str);
627
312
            } else {
628
0
                allpass_state->regexp_pattern_match(pattern_str);
629
0
            }
630
312
        }
631
345
        if (equal_state->_pattern_matched) {
632
315
            if constexpr (LIKE_PATTERN) {
633
315
                equal_state->like_pattern_match(pattern_str);
634
315
            } else {
635
0
                equal_state->regexp_pattern_match(pattern_str);
636
0
            }
637
315
        }
638
345
        if (substring_state->_pattern_matched) {
639
321
            if constexpr (LIKE_PATTERN) {
640
321
                substring_state->like_pattern_match(pattern_str);
641
321
            } else {
642
0
                substring_state->regexp_pattern_match(pattern_str);
643
0
            }
644
321
        }
645
345
        if (starts_with_state->_pattern_matched) {
646
313
            if constexpr (LIKE_PATTERN) {
647
313
                starts_with_state->like_pattern_match(pattern_str);
648
313
            } else {
649
0
                starts_with_state->regexp_pattern_match(pattern_str);
650
0
            }
651
313
        }
652
345
        if (ends_with_state->_pattern_matched) {
653
312
            if constexpr (LIKE_PATTERN) {
654
312
                ends_with_state->like_pattern_match(pattern_str);
655
312
            } else {
656
0
                ends_with_state->regexp_pattern_match(pattern_str);
657
0
            }
658
312
        }
659
345
    }
660
661
295
    if (allpass_state->_pattern_matched) {
662
21
        return allpass_state;
663
274
    } else if (equal_state->_pattern_matched) {
664
78
        return equal_state;
665
196
    } else if (substring_state->_pattern_matched) {
666
27
        return substring_state;
667
169
    } else if (starts_with_state->_pattern_matched) {
668
31
        return starts_with_state;
669
138
    } else if (ends_with_state->_pattern_matched) {
670
31
        return ends_with_state;
671
107
    } else {
672
107
        return nullptr;
673
107
    }
674
295
}
_ZN5doris10vectorized16FunctionLikeBase24pattern_type_recognitionILb1EEESt10shared_ptrINS0_24VectorPatternSearchStateEERKNS0_9ColumnStrIjEE
Line
Count
Source
609
307
VPatternSearchStateSPtr FunctionLikeBase::pattern_type_recognition(const ColumnString& patterns) {
610
307
    VPatternSearchStateSPtr allpass_state = std::make_shared<VectorAllpassSearchState>();
611
307
    VPatternSearchStateSPtr equal_state = std::make_shared<VectorEqualSearchState>();
612
307
    VPatternSearchStateSPtr substring_state = std::make_shared<VectorSubStringSearchState>();
613
307
    VPatternSearchStateSPtr starts_with_state = std::make_shared<VectorStartsWithSearchState>();
614
307
    VPatternSearchStateSPtr ends_with_state = std::make_shared<VectorEndsWithSearchState>();
615
307
    size_t size = patterns.size();
616
617
652
    for (size_t i = 0; i < size; ++i) {
618
357
        if (!allpass_state->_pattern_matched && !equal_state->_pattern_matched &&
619
357
            !substring_state->_pattern_matched && !starts_with_state->_pattern_matched &&
620
357
            !ends_with_state->_pattern_matched) {
621
12
            return nullptr;
622
12
        }
623
345
        std::string pattern_str = patterns.get_data_at(i).to_string();
624
345
        if (allpass_state->_pattern_matched) {
625
312
            if constexpr (LIKE_PATTERN) {
626
312
                allpass_state->like_pattern_match(pattern_str);
627
            } else {
628
                allpass_state->regexp_pattern_match(pattern_str);
629
            }
630
312
        }
631
345
        if (equal_state->_pattern_matched) {
632
315
            if constexpr (LIKE_PATTERN) {
633
315
                equal_state->like_pattern_match(pattern_str);
634
            } else {
635
                equal_state->regexp_pattern_match(pattern_str);
636
            }
637
315
        }
638
345
        if (substring_state->_pattern_matched) {
639
321
            if constexpr (LIKE_PATTERN) {
640
321
                substring_state->like_pattern_match(pattern_str);
641
            } else {
642
                substring_state->regexp_pattern_match(pattern_str);
643
            }
644
321
        }
645
345
        if (starts_with_state->_pattern_matched) {
646
313
            if constexpr (LIKE_PATTERN) {
647
313
                starts_with_state->like_pattern_match(pattern_str);
648
            } else {
649
                starts_with_state->regexp_pattern_match(pattern_str);
650
            }
651
313
        }
652
345
        if (ends_with_state->_pattern_matched) {
653
312
            if constexpr (LIKE_PATTERN) {
654
312
                ends_with_state->like_pattern_match(pattern_str);
655
            } else {
656
                ends_with_state->regexp_pattern_match(pattern_str);
657
            }
658
312
        }
659
345
    }
660
661
295
    if (allpass_state->_pattern_matched) {
662
21
        return allpass_state;
663
274
    } else if (equal_state->_pattern_matched) {
664
78
        return equal_state;
665
196
    } else if (substring_state->_pattern_matched) {
666
27
        return substring_state;
667
169
    } else if (starts_with_state->_pattern_matched) {
668
31
        return starts_with_state;
669
138
    } else if (ends_with_state->_pattern_matched) {
670
31
        return ends_with_state;
671
107
    } else {
672
107
        return nullptr;
673
107
    }
674
295
}
Unexecuted instantiation: _ZN5doris10vectorized16FunctionLikeBase24pattern_type_recognitionILb0EEESt10shared_ptrINS0_24VectorPatternSearchStateEERKNS0_9ColumnStrIjEE
675
676
Status FunctionLikeBase::vector_non_const(const ColumnString& values, const ColumnString& patterns,
677
                                          ColumnUInt8::Container& result, LikeState* state,
678
307
                                          size_t input_rows_count) const {
679
307
    ColumnString::MutablePtr replaced_patterns;
680
307
    VPatternSearchStateSPtr vector_search_state;
681
307
    if (state->is_like_pattern) {
682
307
        if (state->has_custom_escape) {
683
0
            replaced_patterns = ColumnString::create();
684
0
            for (int i = 0; i < input_rows_count; ++i) {
685
0
                std::string val =
686
0
                        replace_pattern_by_escape(patterns.get_data_at(i), state->escape_char);
687
0
                replaced_patterns->insert_data(val.c_str(), val.size());
688
0
            }
689
0
            vector_search_state = pattern_type_recognition<true>(*replaced_patterns);
690
307
        } else {
691
307
            vector_search_state = pattern_type_recognition<true>(patterns);
692
307
        }
693
307
    } else {
694
0
        vector_search_state = pattern_type_recognition<false>(patterns);
695
0
    }
696
697
307
    const ColumnString& real_pattern = state->has_custom_escape ? *replaced_patterns : patterns;
698
699
307
    if (vector_search_state == nullptr) {
700
        // pattern type recognition failed, use default case
701
381
        for (int i = 0; i < input_rows_count; ++i) {
702
262
            const auto pattern_val = real_pattern.get_data_at(i);
703
262
            const auto value_val = values.get_data_at(i);
704
262
            RETURN_IF_ERROR((state->scalar_function)(&state->search_state, value_val, pattern_val,
705
262
                                                     &result[i]));
706
262
        }
707
119
        return Status::OK();
708
119
    }
709
188
    const auto* search_strings =
710
188
            static_cast<const ColumnString*>(vector_search_state->_search_strings.get());
711
188
    return (vector_search_state->_vector_function)(values, *search_strings, result);
712
307
}
713
714
Status FunctionLike::like_fn(const LikeSearchState* state, const ColumnString& val,
715
0
                             const StringRef& pattern, ColumnUInt8::Container& result) {
716
0
    std::string re_pattern;
717
0
    convert_like_pattern(state, std::string(pattern.data, pattern.size), &re_pattern);
718
0
    return regexp_fn(state, val, {re_pattern.c_str(), re_pattern.size()}, result);
719
0
}
720
721
Status FunctionLike::like_fn_scalar(const LikeSearchState* state, const StringRef& val,
722
209
                                    const StringRef& pattern, unsigned char* result) {
723
209
    std::string re_pattern;
724
209
    convert_like_pattern(state, std::string(pattern.data, pattern.size), &re_pattern);
725
726
209
    return regexp_fn_scalar(state, StringRef(val.data, val.size),
727
209
                            {re_pattern.c_str(), re_pattern.size()}, result);
728
209
}
729
730
void FunctionLike::convert_like_pattern(const LikeSearchState* state, const std::string& pattern,
731
319
                                        std::string* re_pattern) {
732
319
    re_pattern->clear();
733
734
319
    if (pattern.empty()) {
735
14
        re_pattern->append("^$");
736
14
        return;
737
14
    }
738
739
    // add ^ to pattern head to match line head
740
305
    if (!pattern.empty() && pattern[0] != '%') {
741
229
        re_pattern->append("^");
742
229
    }
743
744
    // expect % and _, all chars should keep it literal mean.
745
1.96k
    for (size_t i = 0; i < pattern.size(); i++) {
746
1.66k
        char c = pattern[i];
747
1.66k
        if (c == '\\' && i + 1 < pattern.size()) {
748
71
            char next_c = pattern[i + 1];
749
71
            if (next_c == '%' || next_c == '_') {
750
                // convert "\%" and "\_" to literal "%" and "_"
751
35
                re_pattern->append(1, next_c);
752
35
                i++;
753
35
                continue;
754
36
            } else if (next_c == '\\') {
755
                // keep valid escape "\\"
756
18
                re_pattern->append("\\\\");
757
18
                i++;
758
18
                continue;
759
18
            }
760
71
        }
761
762
1.61k
        if (c == '%') {
763
283
            re_pattern->append(".*");
764
1.32k
        } else if (c == '_') {
765
256
            re_pattern->append(".");
766
1.07k
        } else {
767
            // special for hyperscan: [, ], (, ), {, }, -, *, +, \, |, /, :, ^, ., $, ?
768
1.07k
            if (c == '[' || c == ']' || c == '(' || c == ')' || c == '{' || c == '}' || c == '-' ||
769
1.07k
                c == '*' || c == '+' || c == '\\' || c == '|' || c == '/' || c == ':' || c == '^' ||
770
1.07k
                c == '.' || c == '$' || c == '?') {
771
37
                re_pattern->append(1, '\\');
772
37
            }
773
1.07k
            re_pattern->append(1, c);
774
1.07k
        }
775
1.61k
    }
776
777
    // add $ to pattern tail to match line tail
778
305
    if (!pattern.empty() && re_pattern->back() != '*') {
779
195
        re_pattern->append("$");
780
195
    }
781
305
}
782
783
415
void FunctionLike::remove_escape_character(std::string* search_string) {
784
415
    std::string tmp_search_string;
785
415
    tmp_search_string.swap(*search_string);
786
415
    int64_t len = tmp_search_string.length();
787
    // sometime 'like' may allowed converted to 'equals/start_with/end_with/sub_with'
788
    // so we need to remove escape from pattern to construct search string and use to do 'equals/start_with/end_with/sub_with'
789
1.87k
    for (int i = 0; i < len;) {
790
1.46k
        if (tmp_search_string[i] == '\\' && i + 1 < len &&
791
1.46k
            (tmp_search_string[i + 1] == '%' || tmp_search_string[i + 1] == '_' ||
792
44
             tmp_search_string[i + 1] == '\\')) {
793
36
            search_string->append(1, tmp_search_string[i + 1]);
794
36
            i += 2;
795
1.42k
        } else {
796
1.42k
            search_string->append(1, tmp_search_string[i]);
797
1.42k
            i++;
798
1.42k
        }
799
1.46k
    }
800
415
}
801
802
0
bool re2_full_match(const std::string& str, const RE2& re, std::vector<std::string>& results) {
803
0
    if (!re.ok()) {
804
0
        return false;
805
0
    }
806
807
0
    std::vector<RE2::Arg> arguments;
808
0
    std::vector<RE2::Arg*> arguments_ptrs;
809
0
    std::size_t args_count = re.NumberOfCapturingGroups();
810
0
    arguments.resize(args_count);
811
0
    arguments_ptrs.resize(args_count);
812
0
    results.resize(args_count);
813
0
    for (std::size_t i = 0; i < args_count; ++i) {
814
0
        arguments[i] = &results[i];
815
0
        arguments_ptrs[i] = &arguments[i];
816
0
    }
817
818
0
    return RE2::FullMatchN(str, re, arguments_ptrs.data(), (int)args_count);
819
0
}
820
821
0
void verbose_log_match(const std::string& str, const std::string& pattern_name, const RE2& re) {
822
0
    std::vector<std::string> results;
823
0
    VLOG_DEBUG << "arg str: " << str << ", size: " << str.size() << ", pattern " << pattern_name
824
0
               << ": " << re.pattern() << ", size: " << re.pattern().size();
825
0
    if (re2_full_match(str, re, results)) {
826
0
        for (int i = 0; i < results.size(); ++i) {
827
0
            VLOG_DEBUG << "match " << i << ": " << results[i] << ", size: " << results[i].size();
828
0
        }
829
0
    } else {
830
0
        VLOG_DEBUG << "no match";
831
0
    }
832
0
}
833
834
Status FunctionLike::construct_like_const_state(FunctionContext* context, const StringRef& pattern,
835
                                                std::shared_ptr<LikeState>& state,
836
347
                                                bool try_hyperscan) {
837
347
    std::string pattern_str;
838
347
    if (state->has_custom_escape) {
839
1
        pattern_str = replace_pattern_by_escape(pattern, state->escape_char);
840
346
    } else {
841
346
        pattern_str = pattern.to_string();
842
346
    }
843
347
    state->search_state.pattern_str = pattern_str;
844
347
    std::string search_string;
845
846
347
    if (!pattern_str.empty() && RE2::FullMatch(pattern_str, LIKE_ALLPASS_RE)) {
847
22
        state->search_state.set_search_string("");
848
22
        state->function = constant_allpass_fn;
849
22
        state->scalar_function = constant_allpass_fn_scalar;
850
325
    } else if (pattern_str.empty() || RE2::FullMatch(pattern_str, LIKE_EQUALS_RE, &search_string)) {
851
95
        if (VLOG_DEBUG_IS_ON) {
852
0
            verbose_log_match(pattern_str, "LIKE_EQUALS_RE", LIKE_EQUALS_RE);
853
0
            VLOG_DEBUG << "search_string : " << search_string << ", size: " << search_string.size();
854
0
        }
855
95
        remove_escape_character(&search_string);
856
95
        if (VLOG_DEBUG_IS_ON) {
857
0
            VLOG_DEBUG << "search_string escape removed: " << search_string
858
0
                       << ", size: " << search_string.size();
859
0
        }
860
95
        state->search_state.set_search_string(search_string);
861
95
        state->function = constant_equals_fn;
862
95
        state->scalar_function = constant_equals_fn_scalar;
863
230
    } else if (RE2::FullMatch(pattern_str, LIKE_STARTS_WITH_RE, &search_string)) {
864
32
        if (VLOG_DEBUG_IS_ON) {
865
0
            verbose_log_match(pattern_str, "LIKE_STARTS_WITH_RE", LIKE_STARTS_WITH_RE);
866
0
            VLOG_DEBUG << "search_string : " << search_string << ", size: " << search_string.size();
867
0
        }
868
32
        remove_escape_character(&search_string);
869
32
        if (VLOG_DEBUG_IS_ON) {
870
0
            VLOG_DEBUG << "search_string escape removed: " << search_string
871
0
                       << ", size: " << search_string.size();
872
0
        }
873
32
        state->search_state.set_search_string(search_string);
874
32
        state->function = constant_starts_with_fn;
875
32
        state->scalar_function = constant_starts_with_fn_scalar;
876
198
    } else if (RE2::FullMatch(pattern_str, LIKE_ENDS_WITH_RE, &search_string)) {
877
32
        if (VLOG_DEBUG_IS_ON) {
878
0
            verbose_log_match(pattern_str, "LIKE_ENDS_WITH_RE", LIKE_ENDS_WITH_RE);
879
0
            VLOG_DEBUG << "search_string : " << search_string << ", size: " << search_string.size();
880
0
        }
881
32
        remove_escape_character(&search_string);
882
32
        if (VLOG_DEBUG_IS_ON) {
883
0
            VLOG_DEBUG << "search_string escape removed: " << search_string
884
0
                       << ", size: " << search_string.size();
885
0
        }
886
32
        state->search_state.set_search_string(search_string);
887
32
        state->function = constant_ends_with_fn;
888
32
        state->scalar_function = constant_ends_with_fn_scalar;
889
166
    } else if (RE2::FullMatch(pattern_str, LIKE_SUBSTRING_RE, &search_string)) {
890
56
        if (VLOG_DEBUG_IS_ON) {
891
0
            verbose_log_match(pattern_str, "LIKE_SUBSTRING_RE", LIKE_SUBSTRING_RE);
892
0
            VLOG_DEBUG << "search_string : " << search_string << ", size: " << search_string.size();
893
0
        }
894
56
        remove_escape_character(&search_string);
895
56
        if (VLOG_DEBUG_IS_ON) {
896
0
            VLOG_DEBUG << "search_string escape removed: " << search_string
897
0
                       << ", size: " << search_string.size();
898
0
        }
899
56
        state->search_state.set_search_string(search_string);
900
56
        state->function = constant_substring_fn;
901
56
        state->scalar_function = constant_substring_fn_scalar;
902
110
    } else {
903
110
        std::string re_pattern;
904
110
        convert_like_pattern(&state->search_state, pattern_str, &re_pattern);
905
110
        if (VLOG_DEBUG_IS_ON) {
906
0
            VLOG_DEBUG << "hyperscan, pattern str: " << pattern_str
907
0
                       << ", size: " << pattern_str.size() << ", re pattern: " << re_pattern
908
0
                       << ", size: " << re_pattern.size();
909
0
        }
910
911
110
        hs_database_t* database = nullptr;
912
110
        hs_scratch_t* scratch = nullptr;
913
110
        if (try_hyperscan && hs_prepare(context, re_pattern.c_str(), &database, &scratch).ok()) {
914
            // use hyperscan
915
110
            state->search_state.hs_database.reset(database);
916
110
            state->search_state.hs_scratch.reset(scratch);
917
110
        } else {
918
            // fallback to re2
919
            // reset hs_database to nullptr to indicate not use hyperscan
920
0
            state->search_state.hs_database.reset();
921
0
            state->search_state.hs_scratch.reset();
922
923
0
            RE2::Options opts;
924
0
            opts.set_never_nl(false);
925
0
            opts.set_dot_nl(true);
926
0
            state->search_state.regex = std::make_unique<RE2>(re_pattern, opts);
927
0
            if (!state->search_state.regex->ok()) {
928
0
                return Status::InternalError("Invalid regex expression: {}(origin: {})", re_pattern,
929
0
                                             pattern_str);
930
0
            }
931
0
        }
932
933
110
        state->function = constant_regex_fn;
934
110
        state->scalar_function = constant_regex_fn_scalar;
935
110
    }
936
347
    return Status::OK();
937
347
}
938
939
1.07k
Status FunctionLike::open(FunctionContext* context, FunctionContext::FunctionStateScope scope) {
940
1.07k
    if (scope != FunctionContext::THREAD_LOCAL) {
941
536
        return Status::OK();
942
536
    }
943
536
    std::shared_ptr<LikeState> state = std::make_shared<LikeState>();
944
536
    state->is_like_pattern = true;
945
536
    state->function = like_fn;
946
536
    state->scalar_function = like_fn_scalar;
947
536
    if (context->is_col_constant(2)) {
948
1
        state->has_custom_escape = true;
949
1
        const auto escape_col = context->get_constant_col(2)->column_ptr;
950
1
        const auto& escape = escape_col->get_data_at(0);
951
1
        if (escape.size != 1) {
952
0
            return Status::InternalError("Escape character must be a single character, got: {}",
953
0
                                         escape.to_string());
954
0
        }
955
1
        state->escape_char = escape.data[0];
956
1
    }
957
536
    if (context->is_col_constant(1)) {
958
347
        const auto pattern_col = context->get_constant_col(1)->column_ptr;
959
347
        const auto& pattern = pattern_col->get_data_at(0);
960
347
        RETURN_IF_ERROR(construct_like_const_state(context, pattern, state));
961
347
    }
962
536
    context->set_function_state(scope, state);
963
964
536
    return Status::OK();
965
536
}
966
967
Status FunctionRegexpLike::open(FunctionContext* context,
968
122
                                FunctionContext::FunctionStateScope scope) {
969
122
    if (scope != FunctionContext::THREAD_LOCAL) {
970
61
        return Status::OK();
971
61
    }
972
61
    std::shared_ptr<LikeState> state = std::make_shared<LikeState>();
973
61
    context->set_function_state(scope, state);
974
61
    state->is_like_pattern = false;
975
61
    state->function = regexp_fn;
976
61
    state->scalar_function = regexp_fn_scalar;
977
61
    if (context->is_col_constant(1)) {
978
61
        const auto pattern_col = context->get_constant_col(1)->column_ptr;
979
61
        const auto& pattern = pattern_col->get_data_at(0);
980
981
61
        std::string pattern_str = pattern.to_string();
982
61
        std::string search_string;
983
61
        if (RE2::FullMatch(pattern_str, ALLPASS_RE)) {
984
4
            state->search_state.set_search_string("");
985
4
            state->function = constant_allpass_fn;
986
4
            state->scalar_function = constant_allpass_fn_scalar;
987
57
        } else if (RE2::FullMatch(pattern_str, EQUALS_RE, &search_string)) {
988
7
            state->search_state.set_search_string(search_string);
989
7
            state->function = constant_equals_fn;
990
7
            state->scalar_function = constant_equals_fn_scalar;
991
50
        } else if (RE2::FullMatch(pattern_str, STARTS_WITH_RE, &search_string)) {
992
7
            state->search_state.set_search_string(search_string);
993
7
            state->function = constant_starts_with_fn;
994
7
            state->scalar_function = constant_starts_with_fn_scalar;
995
43
        } else if (RE2::FullMatch(pattern_str, ENDS_WITH_RE, &search_string)) {
996
7
            state->search_state.set_search_string(search_string);
997
7
            state->function = constant_ends_with_fn;
998
7
            state->scalar_function = constant_ends_with_fn_scalar;
999
36
        } else if (RE2::FullMatch(pattern_str, SUBSTRING_RE, &search_string)) {
1000
14
            state->search_state.set_search_string(search_string);
1001
14
            state->function = constant_substring_fn;
1002
14
            state->scalar_function = constant_substring_fn_scalar;
1003
22
        } else {
1004
22
            hs_database_t* database = nullptr;
1005
22
            hs_scratch_t* scratch = nullptr;
1006
22
            if (hs_prepare(context, pattern_str.c_str(), &database, &scratch).ok()) {
1007
                // use hyperscan
1008
22
                state->search_state.hs_database.reset(database);
1009
22
                state->search_state.hs_scratch.reset(scratch);
1010
22
            } else {
1011
                // fallback to re2
1012
                // reset hs_database to nullptr to indicate not use hyperscan
1013
0
                state->search_state.hs_database.reset();
1014
0
                state->search_state.hs_scratch.reset();
1015
0
                RE2::Options opts;
1016
0
                opts.set_never_nl(false);
1017
0
                opts.set_dot_nl(true);
1018
0
                state->search_state.regex = std::make_unique<RE2>(pattern_str, opts);
1019
0
                if (!state->search_state.regex->ok()) {
1020
0
                    if (!context->state()->enable_extended_regex()) {
1021
0
                        return Status::InternalError(
1022
0
                                "Invalid regex expression: {}. Error: {}. If you need advanced "
1023
0
                                "regex features, try setting enable_extended_regex=true",
1024
0
                                pattern_str, state->search_state.regex->error());
1025
0
                    }
1026
1027
                    // RE2 failed, fallback to Boost.Regex
1028
                    // This handles advanced regex features like zero-width assertions
1029
0
                    state->search_state.regex.reset();
1030
0
                    try {
1031
0
                        state->search_state.boost_regex =
1032
0
                                std::make_unique<boost::regex>(pattern_str);
1033
0
                    } catch (const boost::regex_error& e) {
1034
0
                        return Status::InternalError("Invalid regex expression: {}. Error: {}",
1035
0
                                                     pattern_str, e.what());
1036
0
                    }
1037
0
                }
1038
0
            }
1039
22
            state->function = constant_regex_fn;
1040
22
            state->scalar_function = constant_regex_fn_scalar;
1041
22
        }
1042
61
    }
1043
61
    return Status::OK();
1044
61
}
1045
1046
1
void register_function_like(SimpleFunctionFactory& factory) {
1047
1
    factory.register_function<FunctionLike>();
1048
1
}
1049
1050
1
void register_function_regexp(SimpleFunctionFactory& factory) {
1051
1
    factory.register_function<FunctionRegexpLike>();
1052
1
    factory.register_alias(FunctionRegexpLike::name, FunctionRegexpLike::alias);
1053
1
}
1054
#include "common/compile_check_end.h"
1055
} // namespace doris::vectorized