Coverage Report

Created: 2026-04-16 21:18

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exprs/function/function_split_by_regexp.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include <fmt/format.h>
19
#include <glog/logging.h>
20
21
#include "common/status.h"
22
#include "core/column/column_array.h"
23
#include "core/column/column_const.h"
24
#include "core/data_type/data_type_array.h"
25
#include "core/data_type/data_type_number.h"
26
#include "core/data_type/data_type_string.h"
27
#include "core/types.h"
28
#include "exprs/function/function.h"
29
#include "exprs/function/simple_function_factory.h"
30
31
namespace doris {
32
33
// Position of one regular-expression group inside the searched subject.
struct Match {
    using size_type = std::string::size_type;

    // Byte offset of the group from the start of the subject; RegexpSplit::match
    // stores std::string::npos here when the group's piece is empty.
    size_type offset;
    // Length of the group in bytes (0 for an empty piece).
    size_type length;
};
37
38
class RegexpSplit {
39
public:
40
    void init(re2::RE2* re2, int32_t max_splits);
41
    void set(const char* pos, const char* end);
42
    bool get(const char*& token_begin, const char*& token_end);
43
44
private:
45
    const char* _pos;
46
    const char* _end;
47
48
    std::int32_t _max_splits = 0;
49
    std::vector<Match> _matches;
50
    int32_t _splits;
51
    re2::RE2* _re2 = nullptr;
52
    unsigned _number_of_subpatterns = 0;
53
54
    unsigned match(const char* subject, size_t subject_size, std::vector<Match>& matches,
55
                   unsigned limit) const;
56
};
57
58
unsigned RegexpSplit::match(const char* subject, size_t subject_size, std::vector<Match>& matches,
59
0
                            unsigned limit) const {
60
0
    matches.clear();
61
62
0
    if (limit == 0) {
63
0
        return 0;
64
0
    }
65
66
0
    limit = std::min(limit, _number_of_subpatterns + 1);
67
0
    std::vector<re2::StringPiece> pieces(limit);
68
69
0
    if (!_re2->Match({subject, subject_size}, 0, subject_size, re2::RE2::UNANCHORED, pieces.data(),
70
0
                     limit)) {
71
0
        return 0;
72
0
    } else {
73
0
        matches.resize(limit);
74
0
        for (size_t i = 0; i < limit; ++i) {
75
0
            if (pieces[i].empty()) {
76
0
                matches[i].offset = std::string::npos;
77
0
                matches[i].length = 0;
78
0
            } else {
79
0
                matches[i].offset = pieces[i].data() - subject;
80
0
                matches[i].length = pieces[i].length();
81
0
            }
82
0
        }
83
0
        return limit;
84
0
    }
85
0
}
86
87
0
// Stores the separator pattern and the split limit. When a pattern is given,
// its capturing-group count is cached for later match() calls; with a null
// pattern the cached count is left untouched.
void RegexpSplit::init(re2::RE2* re2, int32_t max_splits) {
    _re2 = re2;
    _max_splits = max_splits;
    if (_re2 == nullptr) {
        return;
    }
    _number_of_subpatterns = _re2->NumberOfCapturingGroups();
}
94
95
// Called for each next string.
96
0
void RegexpSplit::set(const char* pos, const char* end) {
97
0
    _pos = pos;
98
0
    _end = end;
99
0
    _splits = 0;
100
0
}
101
102
// Get the next token, if any, or return false.
103
0
bool RegexpSplit::get(const char*& token_begin, const char*& token_end) {
104
0
    if (!_re2) {
105
0
        if (_pos == _end) {
106
0
            return false;
107
0
        }
108
109
0
        token_begin = _pos;
110
0
        if (_max_splits != -1) {
111
0
            if (_splits == _max_splits - 1) {
112
0
                token_end = _end;
113
0
                _pos = _end;
114
0
                return true;
115
0
            }
116
0
        }
117
118
0
        _pos += 1;
119
0
        token_end = _pos;
120
0
        ++_splits;
121
0
    } else {
122
0
        if (!_pos || _pos > _end) {
123
0
            return false;
124
0
        }
125
126
0
        token_begin = _pos;
127
0
        if (_max_splits != -1) {
128
0
            if (_splits == _max_splits - 1) {
129
0
                token_end = _end;
130
0
                _pos = nullptr;
131
0
                return true;
132
0
            }
133
0
        }
134
135
0
        if (!match(_pos, _end - _pos, _matches, _number_of_subpatterns + 1) ||
136
0
            !_matches[0].length) {
137
0
            token_end = _end;
138
0
            _pos = _end + 1;
139
0
        } else {
140
0
            token_end = _pos + _matches[0].offset;
141
0
            _pos = token_end + _matches[0].length;
142
0
            ++_splits;
143
0
        }
144
0
    }
145
146
0
    return true;
147
0
}
148
149
template <typename Impl>
150
class SplitByRegexp : public IFunction {
151
public:
152
    static constexpr auto name = "split_by_regexp";
153
154
4
    static FunctionPtr create() { return std::make_shared<SplitByRegexp>(); }
_ZN5doris13SplitByRegexpINS_15TwoArgumentImplEE6createEv
Line
Count
Source
154
2
    static FunctionPtr create() { return std::make_shared<SplitByRegexp>(); }
_ZN5doris13SplitByRegexpINS_17ThreeArgumentImplEE6createEv
Line
Count
Source
154
2
    static FunctionPtr create() { return std::make_shared<SplitByRegexp>(); }
155
156
0
    String get_name() const override { return name; }
Unexecuted instantiation: _ZNK5doris13SplitByRegexpINS_15TwoArgumentImplEE8get_nameB5cxx11Ev
Unexecuted instantiation: _ZNK5doris13SplitByRegexpINS_17ThreeArgumentImplEE8get_nameB5cxx11Ev
157
158
0
    size_t get_number_of_arguments() const override {
159
0
        return get_variadic_argument_types_impl().size();
160
0
    }
Unexecuted instantiation: _ZNK5doris13SplitByRegexpINS_15TwoArgumentImplEE23get_number_of_argumentsEv
Unexecuted instantiation: _ZNK5doris13SplitByRegexpINS_17ThreeArgumentImplEE23get_number_of_argumentsEv
161
162
2
    bool is_variadic() const override { return true; }
_ZNK5doris13SplitByRegexpINS_15TwoArgumentImplEE11is_variadicEv
Line
Count
Source
162
1
    bool is_variadic() const override { return true; }
_ZNK5doris13SplitByRegexpINS_17ThreeArgumentImplEE11is_variadicEv
Line
Count
Source
162
1
    bool is_variadic() const override { return true; }
163
164
2
    DataTypes get_variadic_argument_types_impl() const override {
165
2
        return Impl::get_variadic_argument_types();
166
2
    }
_ZNK5doris13SplitByRegexpINS_15TwoArgumentImplEE32get_variadic_argument_types_implEv
Line
Count
Source
164
1
    DataTypes get_variadic_argument_types_impl() const override {
165
1
        return Impl::get_variadic_argument_types();
166
1
    }
_ZNK5doris13SplitByRegexpINS_17ThreeArgumentImplEE32get_variadic_argument_types_implEv
Line
Count
Source
164
1
    DataTypes get_variadic_argument_types_impl() const override {
165
1
        return Impl::get_variadic_argument_types();
166
1
    }
167
168
0
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
169
0
        DCHECK(is_string_type(arguments[0]->get_primitive_type()))
170
0
                << "first argument for function: " << name << " should be string"
171
0
                << " and arguments[0] is " << arguments[0]->get_name();
172
0
        DCHECK(is_string_type(arguments[1]->get_primitive_type()))
173
0
                << "second argument for function: " << name << " should be string"
174
0
                << " and arguments[1] is " << arguments[1]->get_name();
175
0
        auto nullable_string_type = make_nullable(std::make_shared<DataTypeString>());
176
0
        return std::make_shared<DataTypeArray>(nullable_string_type);
177
0
    }
Unexecuted instantiation: _ZNK5doris13SplitByRegexpINS_15TwoArgumentImplEE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS7_EE
Unexecuted instantiation: _ZNK5doris13SplitByRegexpINS_17ThreeArgumentImplEE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS7_EE
178
179
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
180
0
                        uint32_t result, size_t input_rows_count) const override {
181
0
        return Impl::execute_impl(context, block, arguments, result, input_rows_count);
182
0
    }
Unexecuted instantiation: _ZNK5doris13SplitByRegexpINS_15TwoArgumentImplEE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm
Unexecuted instantiation: _ZNK5doris13SplitByRegexpINS_17ThreeArgumentImplEE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm
183
};
184
185
struct ExecuteImpl {
186
    using NullMapType = PaddedPODArray<UInt8>;
187
    static Status execute_impl(FunctionContext* context, Block& block,
188
                               const ColumnNumbers& arguments, uint32_t result,
189
0
                               size_t input_rows_count) {
190
0
        const auto& [first_column, left_const] =
191
0
                unpack_if_const(block.get_by_position(arguments[0]).column);
192
0
        const auto& [second_column, right_const] =
193
0
                unpack_if_const(block.get_by_position(arguments[1]).column);
194
0
        const auto& [three_column, three_is_const] =
195
0
                unpack_if_const(block.get_by_position(arguments[2]).column);
196
0
        auto limit_value = assert_cast<const ColumnInt32&>(*three_column).get_element(0);
197
0
        const auto& src_column = assert_cast<const ColumnString&>(*first_column);
198
0
        const auto& pattern_column = assert_cast<const ColumnString&>(*second_column);
199
200
0
        auto nullable_string_type = make_nullable(std::make_shared<DataTypeString>());
201
0
        auto dest_column_ptr = ColumnArray::create(nullable_string_type->create_column(),
202
0
                                                   ColumnArray::ColumnOffsets::create());
203
0
        IColumn* dest_nested_column = &dest_column_ptr->get_data();
204
0
        auto& dest_offsets = dest_column_ptr->get_offsets();
205
0
        DCHECK(dest_nested_column != nullptr);
206
207
0
        NullMapType* dest_nested_null_map = nullptr;
208
0
        auto* dest_nullable_col = assert_cast<ColumnNullable*>(dest_nested_column);
209
0
        auto& dest_column_string =
210
0
                assert_cast<ColumnString&>(*(dest_nullable_col->get_nested_column_ptr()));
211
0
        dest_nested_null_map = &dest_nullable_col->get_null_map_column().get_data();
212
0
        RE2::Options opts;
213
0
        opts.set_never_nl(false);
214
0
        opts.set_dot_nl(true);
215
        // split_by_regexp(ColumnString, "xxx")
216
0
        if (right_const) {
217
0
            RETURN_IF_ERROR(_execute_constant_pattern(
218
0
                    src_column, pattern_column.get_data_at(0), dest_column_string, dest_offsets,
219
0
                    dest_nested_null_map, limit_value, input_rows_count, &opts));
220
0
        } else if (left_const) {
221
            // split_by_regexp("xxx", ColumnString)
222
0
            _execute_constant_src_string(src_column.get_data_at(0), pattern_column,
223
0
                                         dest_column_string, dest_offsets, dest_nested_null_map,
224
0
                                         limit_value, input_rows_count, &opts);
225
0
        } else {
226
            // split_by_regexp(ColumnString, ColumnString)
227
0
            _execute_vector_vector(src_column, pattern_column, dest_column_string, dest_offsets,
228
0
                                   dest_nested_null_map, limit_value, input_rows_count, &opts);
229
0
        }
230
231
0
        block.replace_by_position(result, std::move(dest_column_ptr));
232
0
        return Status::OK();
233
0
    }
234
235
private:
236
    static Status _execute_constant_pattern(const ColumnString& src_column_string,
237
                                            const StringRef& pattern_ref,
238
                                            ColumnString& dest_column_string,
239
                                            ColumnArray::Offsets64& dest_offsets,
240
                                            NullMapType* dest_nested_null_map, Int32 limit_value,
241
0
                                            size_t input_rows_count, RE2::Options* opts) {
242
0
        const char* token_begin = nullptr;
243
0
        const char* token_end = nullptr;
244
0
        UInt64 index = 0;
245
0
        std::unique_ptr<re2::RE2> re2_ptr = nullptr;
246
0
        if (pattern_ref.size) {
247
0
            re2_ptr = std::make_unique<re2::RE2>(pattern_ref.to_string_view(), *opts);
248
0
        }
249
0
        RegexpSplit RegexpSplit;
250
0
        RegexpSplit.init(re2_ptr.get(), limit_value);
251
0
        for (int row = 0; row < input_rows_count; ++row) {
252
0
            auto str_data = src_column_string.get_data_at(row);
253
0
            RegexpSplit.set(str_data.begin(), str_data.end());
254
0
            while (RegexpSplit.get(token_begin, token_end)) {
255
0
                size_t token_size = token_end - token_begin;
256
0
                dest_column_string.insert_data(token_begin, token_size);
257
0
                dest_nested_null_map->push_back(false);
258
0
                index += 1;
259
0
            }
260
0
            dest_offsets.push_back(index);
261
0
        }
262
0
        return Status::OK();
263
0
    }
264
265
    static void _execute_constant_src_string(const StringRef& str_ref,
266
                                             const ColumnString& pattern_column,
267
                                             ColumnString& dest_column_string,
268
                                             ColumnArray::Offsets64& dest_offsets,
269
                                             NullMapType* dest_nested_null_map, Int32 limit_value,
270
0
                                             size_t input_rows_count, RE2::Options* opts) {
271
0
        const char* token_begin = nullptr;
272
0
        const char* token_end = nullptr;
273
0
        UInt64 index = 0;
274
0
        RegexpSplit RegexpSplit;
275
276
0
        for (int row = 0; row < input_rows_count; ++row) {
277
0
            std::unique_ptr<re2::RE2> re2_ptr = nullptr;
278
0
            auto pattern = pattern_column.get_data_at(row);
279
0
            if (pattern.size) {
280
0
                re2_ptr = std::make_unique<re2::RE2>(pattern.to_string_view(), *opts);
281
0
                if (!re2_ptr->ok()) {
282
0
                    dest_column_string.insert_default();
283
0
                    dest_nested_null_map->push_back(true);
284
0
                    index += 1;
285
0
                    dest_offsets.push_back(index);
286
0
                    continue;
287
0
                }
288
0
            }
289
290
0
            RegexpSplit.init(re2_ptr.get(), limit_value);
291
0
            RegexpSplit.set(str_ref.begin(), str_ref.end());
292
0
            while (RegexpSplit.get(token_begin, token_end)) {
293
0
                size_t token_size = token_end - token_begin;
294
0
                dest_column_string.insert_data(token_begin, token_size);
295
0
                dest_nested_null_map->push_back(false);
296
0
                index += 1;
297
0
            }
298
0
            dest_offsets.push_back(index);
299
0
        }
300
0
    }
301
302
    static void _execute_vector_vector(const ColumnString& src_column_string,
303
                                       const ColumnString& pattern_column,
304
                                       ColumnString& dest_column_string,
305
                                       ColumnArray::Offsets64& dest_offsets,
306
                                       NullMapType* dest_nested_null_map, Int32 limit_value,
307
0
                                       size_t input_rows_count, RE2::Options* opts) {
308
0
        const char* token_begin = nullptr;
309
0
        const char* token_end = nullptr;
310
0
        UInt64 index = 0;
311
0
        RegexpSplit RegexpSplit;
312
313
0
        for (int row = 0; row < input_rows_count; ++row) {
314
0
            std::unique_ptr<re2::RE2> re2_ptr = nullptr;
315
0
            auto str_data = src_column_string.get_data_at(row);
316
0
            auto pattern = pattern_column.get_data_at(row);
317
0
            if (pattern.size) {
318
0
                re2_ptr = std::make_unique<re2::RE2>(pattern.to_string_view(), *opts);
319
0
                if (!re2_ptr->ok()) {
320
0
                    dest_column_string.insert_default();
321
0
                    dest_nested_null_map->push_back(true);
322
0
                    index += 1;
323
0
                    dest_offsets.push_back(index);
324
0
                    continue;
325
0
                }
326
0
            }
327
0
            RegexpSplit.init(re2_ptr.get(), limit_value);
328
0
            RegexpSplit.set(str_data.begin(), str_data.end());
329
0
            while (RegexpSplit.get(token_begin, token_end)) {
330
0
                size_t token_size = token_end - token_begin;
331
0
                dest_column_string.insert_data(token_begin, token_size);
332
0
                dest_nested_null_map->push_back(false);
333
0
                index += 1;
334
0
            }
335
0
            dest_offsets.push_back(index);
336
0
        }
337
0
    }
338
};
339
340
struct TwoArgumentImpl {
341
1
    static DataTypes get_variadic_argument_types() {
342
1
        return {std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>()};
343
1
    }
344
345
    static Status execute_impl(FunctionContext* context, Block& block,
346
                               const ColumnNumbers& arguments, uint32_t result,
347
0
                               size_t input_rows_count) {
348
0
        DCHECK_EQ(arguments.size(), 2);
349
0
        auto max_limit = ColumnConst::create(ColumnInt32::create(1, -1), input_rows_count);
350
0
        block.insert({std::move(max_limit), std::make_shared<DataTypeInt32>(), "max_limit"});
351
0
        ColumnNumbers temp_arguments = {arguments[0], arguments[1], block.columns() - 1};
352
0
        return ExecuteImpl::execute_impl(context, block, temp_arguments, result, input_rows_count);
353
0
    }
354
};
355
356
struct ThreeArgumentImpl {
357
1
    static DataTypes get_variadic_argument_types() {
358
1
        return {std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>(),
359
1
                std::make_shared<DataTypeInt32>()};
360
1
    }
361
    static Status execute_impl(FunctionContext* context, Block& block,
362
                               const ColumnNumbers& arguments, uint32_t result,
363
0
                               size_t input_rows_count) {
364
0
        DCHECK_EQ(arguments.size(), 3);
365
0
        return ExecuteImpl::execute_impl(context, block, arguments, result, input_rows_count);
366
0
    }
367
};
368
369
1
// Registers both split_by_regexp overloads (two- and three-argument) with the factory.
void register_function_split_by_regexp(SimpleFunctionFactory& factory) {
    using TwoArgSplit = SplitByRegexp<TwoArgumentImpl>;
    using ThreeArgSplit = SplitByRegexp<ThreeArgumentImpl>;

    factory.register_function<TwoArgSplit>();
    factory.register_function<ThreeArgSplit>();
}
373
374
} // namespace doris