Coverage Report

Created: 2026-03-16 01:21

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exprs/function/function_split_by_regexp.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include <fmt/format.h>
19
#include <glog/logging.h>
20
21
#include "common/status.h"
22
#include "core/column/column_array.h"
23
#include "core/column/column_const.h"
24
#include "core/data_type/data_type_array.h"
25
#include "core/data_type/data_type_number.h"
26
#include "core/data_type/data_type_string.h"
27
#include "core/types.h"
28
#include "exprs/function/function.h"
29
#include "exprs/function/function_string.h"
30
#include "exprs/function/simple_function_factory.h"
31
32
namespace doris {
33
#include "common/compile_check_begin.h"
34
35
// Position of one regex (sub)match inside the searched subject.
// A default-constructed Match describes "no match": offset is npos, length is 0.
struct Match {
    // Byte offset of the match from the start of the subject, or npos when absent.
    std::string::size_type offset = std::string::npos;
    // Length of the match in bytes.
    std::string::size_type length = 0;
};
39
40
class RegexpSplit {
41
public:
42
    void init(re2::RE2* re2, int32_t max_splits);
43
    void set(const char* pos, const char* end);
44
    bool get(const char*& token_begin, const char*& token_end);
45
46
private:
47
    const char* _pos;
48
    const char* _end;
49
50
    std::int32_t _max_splits = 0;
51
    std::vector<Match> _matches;
52
    int32_t _splits;
53
    re2::RE2* _re2 = nullptr;
54
    unsigned _number_of_subpatterns = 0;
55
56
    unsigned match(const char* subject, size_t subject_size, std::vector<Match>& matches,
57
                   unsigned limit) const;
58
};
59
60
unsigned RegexpSplit::match(const char* subject, size_t subject_size, std::vector<Match>& matches,
61
0
                            unsigned limit) const {
62
0
    matches.clear();
63
64
0
    if (limit == 0) {
65
0
        return 0;
66
0
    }
67
68
0
    limit = std::min(limit, _number_of_subpatterns + 1);
69
0
    std::vector<re2::StringPiece> pieces(limit);
70
71
0
    if (!_re2->Match({subject, subject_size}, 0, subject_size, re2::RE2::UNANCHORED, pieces.data(),
72
0
                     limit)) {
73
0
        return 0;
74
0
    } else {
75
0
        matches.resize(limit);
76
0
        for (size_t i = 0; i < limit; ++i) {
77
0
            if (pieces[i].empty()) {
78
0
                matches[i].offset = std::string::npos;
79
0
                matches[i].length = 0;
80
0
            } else {
81
0
                matches[i].offset = pieces[i].data() - subject;
82
0
                matches[i].length = pieces[i].length();
83
0
            }
84
0
        }
85
0
        return limit;
86
0
    }
87
0
}
88
89
0
// Binds the pattern and the split limit used by subsequent get() calls.
// With a null pattern the stored capturing-group count is left as it was.
void RegexpSplit::init(re2::RE2* re2, int32_t max_splits) {
    _max_splits = max_splits;
    _re2 = re2;
    if (_re2 == nullptr) {
        return;
    }
    _number_of_subpatterns = _re2->NumberOfCapturingGroups();
}
96
97
// Called for each next string.
98
0
void RegexpSplit::set(const char* pos, const char* end) {
99
0
    _pos = pos;
100
0
    _end = end;
101
0
    _splits = 0;
102
0
}
103
104
// Get the next token, if any, or return false.
105
0
bool RegexpSplit::get(const char*& token_begin, const char*& token_end) {
106
0
    if (!_re2) {
107
0
        if (_pos == _end) {
108
0
            return false;
109
0
        }
110
111
0
        token_begin = _pos;
112
0
        if (_max_splits != -1) {
113
0
            if (_splits == _max_splits - 1) {
114
0
                token_end = _end;
115
0
                _pos = _end;
116
0
                return true;
117
0
            }
118
0
        }
119
120
0
        _pos += 1;
121
0
        token_end = _pos;
122
0
        ++_splits;
123
0
    } else {
124
0
        if (!_pos || _pos > _end) {
125
0
            return false;
126
0
        }
127
128
0
        token_begin = _pos;
129
0
        if (_max_splits != -1) {
130
0
            if (_splits == _max_splits - 1) {
131
0
                token_end = _end;
132
0
                _pos = nullptr;
133
0
                return true;
134
0
            }
135
0
        }
136
137
0
        if (!match(_pos, _end - _pos, _matches, _number_of_subpatterns + 1) ||
138
0
            !_matches[0].length) {
139
0
            token_end = _end;
140
0
            _pos = _end + 1;
141
0
        } else {
142
0
            token_end = _pos + _matches[0].offset;
143
0
            _pos = token_end + _matches[0].length;
144
0
            ++_splits;
145
0
        }
146
0
    }
147
148
0
    return true;
149
0
}
150
151
template <typename Impl>
152
class SplitByRegexp : public IFunction {
153
public:
154
    static constexpr auto name = "split_by_regexp";
155
156
4
    static FunctionPtr create() { return std::make_shared<SplitByRegexp>(); }
_ZN5doris13SplitByRegexpINS_15TwoArgumentImplEE6createEv
Line
Count
Source
156
2
    static FunctionPtr create() { return std::make_shared<SplitByRegexp>(); }
_ZN5doris13SplitByRegexpINS_17ThreeArgumentImplEE6createEv
Line
Count
Source
156
2
    static FunctionPtr create() { return std::make_shared<SplitByRegexp>(); }
157
158
0
    String get_name() const override { return name; }
Unexecuted instantiation: _ZNK5doris13SplitByRegexpINS_15TwoArgumentImplEE8get_nameB5cxx11Ev
Unexecuted instantiation: _ZNK5doris13SplitByRegexpINS_17ThreeArgumentImplEE8get_nameB5cxx11Ev
159
160
0
    size_t get_number_of_arguments() const override {
161
0
        return get_variadic_argument_types_impl().size();
162
0
    }
Unexecuted instantiation: _ZNK5doris13SplitByRegexpINS_15TwoArgumentImplEE23get_number_of_argumentsEv
Unexecuted instantiation: _ZNK5doris13SplitByRegexpINS_17ThreeArgumentImplEE23get_number_of_argumentsEv
163
164
2
    bool is_variadic() const override { return true; }
_ZNK5doris13SplitByRegexpINS_15TwoArgumentImplEE11is_variadicEv
Line
Count
Source
164
1
    bool is_variadic() const override { return true; }
_ZNK5doris13SplitByRegexpINS_17ThreeArgumentImplEE11is_variadicEv
Line
Count
Source
164
1
    bool is_variadic() const override { return true; }
165
166
2
    DataTypes get_variadic_argument_types_impl() const override {
167
2
        return Impl::get_variadic_argument_types();
168
2
    }
_ZNK5doris13SplitByRegexpINS_15TwoArgumentImplEE32get_variadic_argument_types_implEv
Line
Count
Source
166
1
    DataTypes get_variadic_argument_types_impl() const override {
167
1
        return Impl::get_variadic_argument_types();
168
1
    }
_ZNK5doris13SplitByRegexpINS_17ThreeArgumentImplEE32get_variadic_argument_types_implEv
Line
Count
Source
166
1
    DataTypes get_variadic_argument_types_impl() const override {
167
1
        return Impl::get_variadic_argument_types();
168
1
    }
169
170
0
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
171
0
        DCHECK(is_string_type(arguments[0]->get_primitive_type()))
172
0
                << "first argument for function: " << name << " should be string"
173
0
                << " and arguments[0] is " << arguments[0]->get_name();
174
0
        DCHECK(is_string_type(arguments[1]->get_primitive_type()))
175
0
                << "second argument for function: " << name << " should be string"
176
0
                << " and arguments[1] is " << arguments[1]->get_name();
177
0
        auto nullable_string_type = make_nullable(std::make_shared<DataTypeString>());
178
0
        return std::make_shared<DataTypeArray>(nullable_string_type);
179
0
    }
Unexecuted instantiation: _ZNK5doris13SplitByRegexpINS_15TwoArgumentImplEE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS7_EE
Unexecuted instantiation: _ZNK5doris13SplitByRegexpINS_17ThreeArgumentImplEE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS7_EE
180
181
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
182
0
                        uint32_t result, size_t input_rows_count) const override {
183
0
        return Impl::execute_impl(context, block, arguments, result, input_rows_count);
184
0
    }
Unexecuted instantiation: _ZNK5doris13SplitByRegexpINS_15TwoArgumentImplEE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm
Unexecuted instantiation: _ZNK5doris13SplitByRegexpINS_17ThreeArgumentImplEE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm
185
};
186
187
struct ExecuteImpl {
188
    using NullMapType = PaddedPODArray<UInt8>;
189
    static Status execute_impl(FunctionContext* context, Block& block,
190
                               const ColumnNumbers& arguments, uint32_t result,
191
0
                               size_t input_rows_count) {
192
0
        const auto& [first_column, left_const] =
193
0
                unpack_if_const(block.get_by_position(arguments[0]).column);
194
0
        const auto& [second_column, right_const] =
195
0
                unpack_if_const(block.get_by_position(arguments[1]).column);
196
0
        const auto& [three_column, three_is_const] =
197
0
                unpack_if_const(block.get_by_position(arguments[2]).column);
198
0
        auto limit_value = assert_cast<const ColumnInt32&>(*three_column).get_element(0);
199
0
        const auto& src_column = assert_cast<const ColumnString&>(*first_column);
200
0
        const auto& pattern_column = assert_cast<const ColumnString&>(*second_column);
201
202
0
        auto nullable_string_type = make_nullable(std::make_shared<DataTypeString>());
203
0
        auto dest_column_ptr = ColumnArray::create(nullable_string_type->create_column(),
204
0
                                                   ColumnArray::ColumnOffsets::create());
205
0
        IColumn* dest_nested_column = &dest_column_ptr->get_data();
206
0
        auto& dest_offsets = dest_column_ptr->get_offsets();
207
0
        DCHECK(dest_nested_column != nullptr);
208
209
0
        NullMapType* dest_nested_null_map = nullptr;
210
0
        auto* dest_nullable_col = assert_cast<ColumnNullable*>(dest_nested_column);
211
0
        auto& dest_column_string =
212
0
                assert_cast<ColumnString&>(*(dest_nullable_col->get_nested_column_ptr()));
213
0
        dest_nested_null_map = &dest_nullable_col->get_null_map_column().get_data();
214
0
        RE2::Options opts;
215
0
        opts.set_never_nl(false);
216
0
        opts.set_dot_nl(true);
217
        // split_by_regexp(ColumnString, "xxx")
218
0
        if (right_const) {
219
0
            RETURN_IF_ERROR(_execute_constant_pattern(
220
0
                    src_column, pattern_column.get_data_at(0), dest_column_string, dest_offsets,
221
0
                    dest_nested_null_map, limit_value, input_rows_count, &opts));
222
0
        } else if (left_const) {
223
            // split_by_regexp("xxx", ColumnString)
224
0
            _execute_constant_src_string(src_column.get_data_at(0), pattern_column,
225
0
                                         dest_column_string, dest_offsets, dest_nested_null_map,
226
0
                                         limit_value, input_rows_count, &opts);
227
0
        } else {
228
            // split_by_regexp(ColumnString, ColumnString)
229
0
            _execute_vector_vector(src_column, pattern_column, dest_column_string, dest_offsets,
230
0
                                   dest_nested_null_map, limit_value, input_rows_count, &opts);
231
0
        }
232
233
0
        block.replace_by_position(result, std::move(dest_column_ptr));
234
0
        return Status::OK();
235
0
    }
236
237
private:
238
    static Status _execute_constant_pattern(const ColumnString& src_column_string,
239
                                            const StringRef& pattern_ref,
240
                                            ColumnString& dest_column_string,
241
                                            ColumnArray::Offsets64& dest_offsets,
242
                                            NullMapType* dest_nested_null_map, Int32 limit_value,
243
0
                                            size_t input_rows_count, RE2::Options* opts) {
244
0
        const char* token_begin = nullptr;
245
0
        const char* token_end = nullptr;
246
0
        UInt64 index = 0;
247
0
        std::unique_ptr<re2::RE2> re2_ptr = nullptr;
248
0
        if (pattern_ref.size) {
249
0
            re2_ptr = std::make_unique<re2::RE2>(pattern_ref.to_string_view(), *opts);
250
0
        }
251
0
        RegexpSplit RegexpSplit;
252
0
        RegexpSplit.init(re2_ptr.get(), limit_value);
253
0
        for (int row = 0; row < input_rows_count; ++row) {
254
0
            auto str_data = src_column_string.get_data_at(row);
255
0
            RegexpSplit.set(str_data.begin(), str_data.end());
256
0
            while (RegexpSplit.get(token_begin, token_end)) {
257
0
                size_t token_size = token_end - token_begin;
258
0
                dest_column_string.insert_data(token_begin, token_size);
259
0
                dest_nested_null_map->push_back(false);
260
0
                index += 1;
261
0
            }
262
0
            dest_offsets.push_back(index);
263
0
        }
264
0
        return Status::OK();
265
0
    }
266
267
    static void _execute_constant_src_string(const StringRef& str_ref,
268
                                             const ColumnString& pattern_column,
269
                                             ColumnString& dest_column_string,
270
                                             ColumnArray::Offsets64& dest_offsets,
271
                                             NullMapType* dest_nested_null_map, Int32 limit_value,
272
0
                                             size_t input_rows_count, RE2::Options* opts) {
273
0
        const char* token_begin = nullptr;
274
0
        const char* token_end = nullptr;
275
0
        UInt64 index = 0;
276
0
        RegexpSplit RegexpSplit;
277
278
0
        for (int row = 0; row < input_rows_count; ++row) {
279
0
            std::unique_ptr<re2::RE2> re2_ptr = nullptr;
280
0
            auto pattern = pattern_column.get_data_at(row);
281
0
            if (pattern.size) {
282
0
                re2_ptr = std::make_unique<re2::RE2>(pattern.to_string_view(), *opts);
283
0
                if (!re2_ptr->ok()) {
284
0
                    dest_column_string.insert_default();
285
0
                    dest_nested_null_map->push_back(true);
286
0
                    index += 1;
287
0
                    dest_offsets.push_back(index);
288
0
                    continue;
289
0
                }
290
0
            }
291
292
0
            RegexpSplit.init(re2_ptr.get(), limit_value);
293
0
            RegexpSplit.set(str_ref.begin(), str_ref.end());
294
0
            while (RegexpSplit.get(token_begin, token_end)) {
295
0
                size_t token_size = token_end - token_begin;
296
0
                dest_column_string.insert_data(token_begin, token_size);
297
0
                dest_nested_null_map->push_back(false);
298
0
                index += 1;
299
0
            }
300
0
            dest_offsets.push_back(index);
301
0
        }
302
0
    }
303
304
    static void _execute_vector_vector(const ColumnString& src_column_string,
305
                                       const ColumnString& pattern_column,
306
                                       ColumnString& dest_column_string,
307
                                       ColumnArray::Offsets64& dest_offsets,
308
                                       NullMapType* dest_nested_null_map, Int32 limit_value,
309
0
                                       size_t input_rows_count, RE2::Options* opts) {
310
0
        const char* token_begin = nullptr;
311
0
        const char* token_end = nullptr;
312
0
        UInt64 index = 0;
313
0
        RegexpSplit RegexpSplit;
314
315
0
        for (int row = 0; row < input_rows_count; ++row) {
316
0
            std::unique_ptr<re2::RE2> re2_ptr = nullptr;
317
0
            auto str_data = src_column_string.get_data_at(row);
318
0
            auto pattern = pattern_column.get_data_at(row);
319
0
            if (pattern.size) {
320
0
                re2_ptr = std::make_unique<re2::RE2>(pattern.to_string_view(), *opts);
321
0
                if (!re2_ptr->ok()) {
322
0
                    dest_column_string.insert_default();
323
0
                    dest_nested_null_map->push_back(true);
324
0
                    index += 1;
325
0
                    dest_offsets.push_back(index);
326
0
                    continue;
327
0
                }
328
0
            }
329
0
            RegexpSplit.init(re2_ptr.get(), limit_value);
330
0
            RegexpSplit.set(str_data.begin(), str_data.end());
331
0
            while (RegexpSplit.get(token_begin, token_end)) {
332
0
                size_t token_size = token_end - token_begin;
333
0
                dest_column_string.insert_data(token_begin, token_size);
334
0
                dest_nested_null_map->push_back(false);
335
0
                index += 1;
336
0
            }
337
0
            dest_offsets.push_back(index);
338
0
        }
339
0
    }
340
};
341
342
struct TwoArgumentImpl {
343
1
    static DataTypes get_variadic_argument_types() {
344
1
        return {std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>()};
345
1
    }
346
347
    static Status execute_impl(FunctionContext* context, Block& block,
348
                               const ColumnNumbers& arguments, uint32_t result,
349
0
                               size_t input_rows_count) {
350
0
        DCHECK_EQ(arguments.size(), 2);
351
0
        auto max_limit = ColumnConst::create(ColumnInt32::create(1, -1), input_rows_count);
352
0
        block.insert({std::move(max_limit), std::make_shared<DataTypeInt32>(), "max_limit"});
353
0
        ColumnNumbers temp_arguments = {arguments[0], arguments[1], block.columns() - 1};
354
0
        return ExecuteImpl::execute_impl(context, block, temp_arguments, result, input_rows_count);
355
0
    }
356
};
357
358
struct ThreeArgumentImpl {
359
1
    static DataTypes get_variadic_argument_types() {
360
1
        return {std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>(),
361
1
                std::make_shared<DataTypeInt32>()};
362
1
    }
363
    static Status execute_impl(FunctionContext* context, Block& block,
364
                               const ColumnNumbers& arguments, uint32_t result,
365
0
                               size_t input_rows_count) {
366
0
        DCHECK_EQ(arguments.size(), 3);
367
0
        return ExecuteImpl::execute_impl(context, block, arguments, result, input_rows_count);
368
0
    }
369
};
370
371
1
// Registers both split_by_regexp overloads (two- and three-argument) with the factory.
void register_function_split_by_regexp(SimpleFunctionFactory& factory) {
    using TwoArgFunction = SplitByRegexp<TwoArgumentImpl>;
    using ThreeArgFunction = SplitByRegexp<ThreeArgumentImpl>;

    factory.register_function<TwoArgFunction>();
    factory.register_function<ThreeArgFunction>();
}
375
376
} // namespace doris