Coverage Report

Created: 2026-06-10 15:48

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exprs/function/function_string_search.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include <cstddef>
19
#include <cstring>
20
#include <numeric>
21
#include <string>
22
#include <string_view>
23
#include <vector>
24
25
#include "common/status.h"
26
#include "core/assert_cast.h"
27
#include "core/block/block.h"
28
#include "core/block/column_numbers.h"
29
#include "core/column/column_array.h"
30
#include "core/column/column_const.h"
31
#include "core/column/column_nullable.h"
32
#include "core/column/column_string.h"
33
#include "core/column/column_vector.h"
34
#include "core/data_type/data_type_array.h"
35
#include "core/data_type/data_type_nullable.h"
36
#include "core/data_type/data_type_number.h"
37
#include "core/data_type/data_type_string.h"
38
#include "core/data_type/define_primitive_type.h"
39
#include "core/memcmp_small.h"
40
#include "core/memcpy_small.h"
41
#include "core/pod_array_fwd.h"
42
#include "core/string_ref.h"
43
#include "exec/common/stringop_substring.h"
44
#include "exec/common/template_helpers.hpp"
45
#include "exec/common/util.hpp"
46
#include "exprs/function/function.h"
47
#include "exprs/function/function_helpers.h"
48
#include "exprs/function/simple_function_factory.h"
49
#include "exprs/function_context.h"
50
#include "util/simd/vstring_function.h"
51
#include "util/string_search.hpp"
52
53
namespace doris {
54
#include "common/compile_check_avoid_begin.h"
55
56
class FunctionStringLocatePos : public IFunction {
57
public:
58
    static constexpr auto name = "locate";
59
816
    static FunctionPtr create() { return std::make_shared<FunctionStringLocatePos>(); }
60
0
    String get_name() const override { return name; }
61
0
    size_t get_number_of_arguments() const override { return 3; }
62
63
814
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
64
814
        return std::make_shared<DataTypeInt32>();
65
814
    }
66
67
1
    DataTypes get_variadic_argument_types_impl() const override {
68
1
        return {std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>(),
69
1
                std::make_shared<DataTypeInt32>()};
70
1
    }
71
72
815
    bool is_variadic() const override { return true; }
73
74
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
75
576
                        uint32_t result, size_t input_rows_count) const override {
76
576
        if (arguments.size() != 3) {
77
0
            return Status::InvalidArgument("Function {} requires 3 arguments, but got {}",
78
0
                                           get_name(), arguments.size());
79
0
        }
80
576
        bool col_const[3];
81
576
        ColumnPtr argument_columns[3];
82
2.30k
        for (int i = 0; i < 3; ++i) {
83
1.72k
            std::tie(argument_columns[i], col_const[i]) =
84
1.72k
                    unpack_if_const(block.get_by_position(arguments[i]).column);
85
1.72k
        }
86
87
576
        const auto* col_left = assert_cast<const ColumnString*>(argument_columns[0].get());
88
576
        const auto* col_right = assert_cast<const ColumnString*>(argument_columns[1].get());
89
576
        const auto* col_pos = assert_cast<const ColumnInt32*>(argument_columns[2].get());
90
91
576
        ColumnInt32::MutablePtr col_res = ColumnInt32::create();
92
576
        auto& vec_res = col_res->get_data();
93
576
        vec_res.resize(block.rows());
94
95
576
        const bool is_ascii = col_left->is_ascii() && col_right->is_ascii();
96
97
576
        if (col_const[0]) {
98
246
            std::visit(
99
246
                    [&](auto is_ascii, auto str_const, auto pos_const) {
100
246
                        scalar_search<is_ascii, str_const, pos_const>(
101
246
                                col_left->get_data_at(0), col_right, col_pos->get_data(), vec_res,
102
246
                                input_rows_count);
103
246
                    },
_ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb0EESG_SG_EEDaSA_SB_SC_
Line
Count
Source
99
22
                    [&](auto is_ascii, auto str_const, auto pos_const) {
100
22
                        scalar_search<is_ascii, str_const, pos_const>(
101
22
                                col_left->get_data_at(0), col_right, col_pos->get_data(), vec_res,
102
22
                                input_rows_count);
103
22
                    },
_ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb0EESG_SF_IbLb1EEEEDaSA_SB_SC_
Line
Count
Source
99
22
                    [&](auto is_ascii, auto str_const, auto pos_const) {
100
22
                        scalar_search<is_ascii, str_const, pos_const>(
101
22
                                col_left->get_data_at(0), col_right, col_pos->get_data(), vec_res,
102
22
                                input_rows_count);
103
22
                    },
_ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb0EESF_IbLb1EESG_EEDaSA_SB_SC_
Line
Count
Source
99
22
                    [&](auto is_ascii, auto str_const, auto pos_const) {
100
22
                        scalar_search<is_ascii, str_const, pos_const>(
101
22
                                col_left->get_data_at(0), col_right, col_pos->get_data(), vec_res,
102
22
                                input_rows_count);
103
22
                    },
Unexecuted instantiation: _ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb0EESF_IbLb1EESH_EEDaSA_SB_SC_
_ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb1EESF_IbLb0EESH_EEDaSA_SB_SC_
Line
Count
Source
99
60
                    [&](auto is_ascii, auto str_const, auto pos_const) {
100
60
                        scalar_search<is_ascii, str_const, pos_const>(
101
60
                                col_left->get_data_at(0), col_right, col_pos->get_data(), vec_res,
102
60
                                input_rows_count);
103
60
                    },
_ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb1EESF_IbLb0EESG_EEDaSA_SB_SC_
Line
Count
Source
99
60
                    [&](auto is_ascii, auto str_const, auto pos_const) {
100
60
                        scalar_search<is_ascii, str_const, pos_const>(
101
60
                                col_left->get_data_at(0), col_right, col_pos->get_data(), vec_res,
102
60
                                input_rows_count);
103
60
                    },
_ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb1EESG_SF_IbLb0EEEEDaSA_SB_SC_
Line
Count
Source
99
60
                    [&](auto is_ascii, auto str_const, auto pos_const) {
100
60
                        scalar_search<is_ascii, str_const, pos_const>(
101
60
                                col_left->get_data_at(0), col_right, col_pos->get_data(), vec_res,
102
60
                                input_rows_count);
103
60
                    },
Unexecuted instantiation: _ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb1EESG_SG_EEDaSA_SB_SC_
104
246
                    make_bool_variant(is_ascii), make_bool_variant(col_const[1]),
105
246
                    make_bool_variant(col_const[2]));
106
107
330
        } else {
108
330
            std::visit(
109
330
                    [&](auto is_ascii, auto str_const, auto pos_const) {
110
330
                        vector_search<is_ascii, str_const, pos_const>(col_left, col_right,
111
330
                                                                      col_pos->get_data(), vec_res,
112
330
                                                                      input_rows_count);
113
330
                    },
_ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E0_clISt17integral_constantIbLb0EESG_SG_EEDaSA_SB_SC_
Line
Count
Source
109
23
                    [&](auto is_ascii, auto str_const, auto pos_const) {
110
23
                        vector_search<is_ascii, str_const, pos_const>(col_left, col_right,
111
23
                                                                      col_pos->get_data(), vec_res,
112
23
                                                                      input_rows_count);
113
23
                    },
_ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E0_clISt17integral_constantIbLb0EESG_SF_IbLb1EEEEDaSA_SB_SC_
Line
Count
Source
109
22
                    [&](auto is_ascii, auto str_const, auto pos_const) {
110
22
                        vector_search<is_ascii, str_const, pos_const>(col_left, col_right,
111
22
                                                                      col_pos->get_data(), vec_res,
112
22
                                                                      input_rows_count);
113
22
                    },
_ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E0_clISt17integral_constantIbLb0EESF_IbLb1EESG_EEDaSA_SB_SC_
Line
Count
Source
109
22
                    [&](auto is_ascii, auto str_const, auto pos_const) {
110
22
                        vector_search<is_ascii, str_const, pos_const>(col_left, col_right,
111
22
                                                                      col_pos->get_data(), vec_res,
112
22
                                                                      input_rows_count);
113
22
                    },
_ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E0_clISt17integral_constantIbLb0EESF_IbLb1EESH_EEDaSA_SB_SC_
Line
Count
Source
109
22
                    [&](auto is_ascii, auto str_const, auto pos_const) {
110
22
                        vector_search<is_ascii, str_const, pos_const>(col_left, col_right,
111
22
                                                                      col_pos->get_data(), vec_res,
112
22
                                                                      input_rows_count);
113
22
                    },
_ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E0_clISt17integral_constantIbLb1EESF_IbLb0EESH_EEDaSA_SB_SC_
Line
Count
Source
109
61
                    [&](auto is_ascii, auto str_const, auto pos_const) {
110
61
                        vector_search<is_ascii, str_const, pos_const>(col_left, col_right,
111
61
                                                                      col_pos->get_data(), vec_res,
112
61
                                                                      input_rows_count);
113
61
                    },
_ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E0_clISt17integral_constantIbLb1EESF_IbLb0EESG_EEDaSA_SB_SC_
Line
Count
Source
109
60
                    [&](auto is_ascii, auto str_const, auto pos_const) {
110
60
                        vector_search<is_ascii, str_const, pos_const>(col_left, col_right,
111
60
                                                                      col_pos->get_data(), vec_res,
112
60
                                                                      input_rows_count);
113
60
                    },
_ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E0_clISt17integral_constantIbLb1EESG_SF_IbLb0EEEEDaSA_SB_SC_
Line
Count
Source
109
60
                    [&](auto is_ascii, auto str_const, auto pos_const) {
110
60
                        vector_search<is_ascii, str_const, pos_const>(col_left, col_right,
111
60
                                                                      col_pos->get_data(), vec_res,
112
60
                                                                      input_rows_count);
113
60
                    },
_ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E0_clISt17integral_constantIbLb1EESG_SG_EEDaSA_SB_SC_
Line
Count
Source
109
60
                    [&](auto is_ascii, auto str_const, auto pos_const) {
110
60
                        vector_search<is_ascii, str_const, pos_const>(col_left, col_right,
111
60
                                                                      col_pos->get_data(), vec_res,
112
60
                                                                      input_rows_count);
113
60
                    },
114
330
                    make_bool_variant(is_ascii), make_bool_variant(col_const[1]),
115
330
                    make_bool_variant(col_const[2]));
116
330
        }
117
576
        block.replace_by_position(result, std::move(col_res));
118
576
        return Status::OK();
119
576
    }
120
121
private:
122
    template <bool is_ascii, bool str_const, bool pos_const>
123
    void scalar_search(const StringRef& ldata, const ColumnString* col_right,
124
                       const PaddedPODArray<Int32>& posdata, PaddedPODArray<Int32>& res,
125
246
                       size_t size) const {
126
246
        res.resize(size);
127
246
        StringRef substr(ldata.data, ldata.size);
128
246
        StringSearch search {&substr};
129
130
492
        for (int i = 0; i < size; ++i) {
131
246
            res[i] = locate_pos<is_ascii>(substr,
132
246
                                          col_right->get_data_at(index_check_const<str_const>(i)),
133
246
                                          search, posdata[index_check_const<pos_const>(i)]);
134
246
        }
135
246
    }
_ZNK5doris23FunctionStringLocatePos13scalar_searchILb0ELb0ELb0EEEvRKNS_9StringRefEPKNS_9ColumnStrIjEERKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSD_m
Line
Count
Source
125
22
                       size_t size) const {
126
22
        res.resize(size);
127
22
        StringRef substr(ldata.data, ldata.size);
128
22
        StringSearch search {&substr};
129
130
44
        for (int i = 0; i < size; ++i) {
131
22
            res[i] = locate_pos<is_ascii>(substr,
132
22
                                          col_right->get_data_at(index_check_const<str_const>(i)),
133
22
                                          search, posdata[index_check_const<pos_const>(i)]);
134
22
        }
135
22
    }
_ZNK5doris23FunctionStringLocatePos13scalar_searchILb0ELb0ELb1EEEvRKNS_9StringRefEPKNS_9ColumnStrIjEERKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSD_m
Line
Count
Source
125
22
                       size_t size) const {
126
22
        res.resize(size);
127
22
        StringRef substr(ldata.data, ldata.size);
128
22
        StringSearch search {&substr};
129
130
44
        for (int i = 0; i < size; ++i) {
131
22
            res[i] = locate_pos<is_ascii>(substr,
132
22
                                          col_right->get_data_at(index_check_const<str_const>(i)),
133
22
                                          search, posdata[index_check_const<pos_const>(i)]);
134
22
        }
135
22
    }
_ZNK5doris23FunctionStringLocatePos13scalar_searchILb0ELb1ELb0EEEvRKNS_9StringRefEPKNS_9ColumnStrIjEERKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSD_m
Line
Count
Source
125
22
                       size_t size) const {
126
22
        res.resize(size);
127
22
        StringRef substr(ldata.data, ldata.size);
128
22
        StringSearch search {&substr};
129
130
44
        for (int i = 0; i < size; ++i) {
131
22
            res[i] = locate_pos<is_ascii>(substr,
132
22
                                          col_right->get_data_at(index_check_const<str_const>(i)),
133
22
                                          search, posdata[index_check_const<pos_const>(i)]);
134
22
        }
135
22
    }
Unexecuted instantiation: _ZNK5doris23FunctionStringLocatePos13scalar_searchILb0ELb1ELb1EEEvRKNS_9StringRefEPKNS_9ColumnStrIjEERKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSD_m
_ZNK5doris23FunctionStringLocatePos13scalar_searchILb1ELb0ELb0EEEvRKNS_9StringRefEPKNS_9ColumnStrIjEERKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSD_m
Line
Count
Source
125
60
                       size_t size) const {
126
60
        res.resize(size);
127
60
        StringRef substr(ldata.data, ldata.size);
128
60
        StringSearch search {&substr};
129
130
120
        for (int i = 0; i < size; ++i) {
131
60
            res[i] = locate_pos<is_ascii>(substr,
132
60
                                          col_right->get_data_at(index_check_const<str_const>(i)),
133
60
                                          search, posdata[index_check_const<pos_const>(i)]);
134
60
        }
135
60
    }
_ZNK5doris23FunctionStringLocatePos13scalar_searchILb1ELb0ELb1EEEvRKNS_9StringRefEPKNS_9ColumnStrIjEERKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSD_m
Line
Count
Source
125
60
                       size_t size) const {
126
60
        res.resize(size);
127
60
        StringRef substr(ldata.data, ldata.size);
128
60
        StringSearch search {&substr};
129
130
120
        for (int i = 0; i < size; ++i) {
131
60
            res[i] = locate_pos<is_ascii>(substr,
132
60
                                          col_right->get_data_at(index_check_const<str_const>(i)),
133
60
                                          search, posdata[index_check_const<pos_const>(i)]);
134
60
        }
135
60
    }
_ZNK5doris23FunctionStringLocatePos13scalar_searchILb1ELb1ELb0EEEvRKNS_9StringRefEPKNS_9ColumnStrIjEERKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSD_m
Line
Count
Source
125
60
                       size_t size) const {
126
60
        res.resize(size);
127
60
        StringRef substr(ldata.data, ldata.size);
128
60
        StringSearch search {&substr};
129
130
120
        for (int i = 0; i < size; ++i) {
131
60
            res[i] = locate_pos<is_ascii>(substr,
132
60
                                          col_right->get_data_at(index_check_const<str_const>(i)),
133
60
                                          search, posdata[index_check_const<pos_const>(i)]);
134
60
        }
135
60
    }
Unexecuted instantiation: _ZNK5doris23FunctionStringLocatePos13scalar_searchILb1ELb1ELb1EEEvRKNS_9StringRefEPKNS_9ColumnStrIjEERKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSD_m
136
137
    template <bool is_ascii, bool str_const, bool pos_const>
138
    void vector_search(const ColumnString* col_left, const ColumnString* col_right,
139
                       const PaddedPODArray<Int32>& posdata, PaddedPODArray<Int32>& res,
140
330
                       size_t size) const {
141
330
        res.resize(size);
142
330
        StringSearch search;
143
774
        for (int i = 0; i < size; ++i) {
144
444
            StringRef substr = col_left->get_data_at(i);
145
444
            search.set_pattern(&substr);
146
444
            res[i] = locate_pos<is_ascii>(substr,
147
444
                                          col_right->get_data_at(index_check_const<str_const>(i)),
148
444
                                          search, posdata[index_check_const<pos_const>(i)]);
149
444
        }
150
330
    }
_ZNK5doris23FunctionStringLocatePos13vector_searchILb0ELb0ELb0EEEvPKNS_9ColumnStrIjEES5_RKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSA_m
Line
Count
Source
140
23
                       size_t size) const {
141
23
        res.resize(size);
142
23
        StringSearch search;
143
71
        for (int i = 0; i < size; ++i) {
144
48
            StringRef substr = col_left->get_data_at(i);
145
48
            search.set_pattern(&substr);
146
48
            res[i] = locate_pos<is_ascii>(substr,
147
48
                                          col_right->get_data_at(index_check_const<str_const>(i)),
148
48
                                          search, posdata[index_check_const<pos_const>(i)]);
149
48
        }
150
23
    }
_ZNK5doris23FunctionStringLocatePos13vector_searchILb0ELb0ELb1EEEvPKNS_9ColumnStrIjEES5_RKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSA_m
Line
Count
Source
140
22
                       size_t size) const {
141
22
        res.resize(size);
142
22
        StringSearch search;
143
44
        for (int i = 0; i < size; ++i) {
144
22
            StringRef substr = col_left->get_data_at(i);
145
22
            search.set_pattern(&substr);
146
22
            res[i] = locate_pos<is_ascii>(substr,
147
22
                                          col_right->get_data_at(index_check_const<str_const>(i)),
148
22
                                          search, posdata[index_check_const<pos_const>(i)]);
149
22
        }
150
22
    }
_ZNK5doris23FunctionStringLocatePos13vector_searchILb0ELb1ELb0EEEvPKNS_9ColumnStrIjEES5_RKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSA_m
Line
Count
Source
140
22
                       size_t size) const {
141
22
        res.resize(size);
142
22
        StringSearch search;
143
44
        for (int i = 0; i < size; ++i) {
144
22
            StringRef substr = col_left->get_data_at(i);
145
22
            search.set_pattern(&substr);
146
22
            res[i] = locate_pos<is_ascii>(substr,
147
22
                                          col_right->get_data_at(index_check_const<str_const>(i)),
148
22
                                          search, posdata[index_check_const<pos_const>(i)]);
149
22
        }
150
22
    }
_ZNK5doris23FunctionStringLocatePos13vector_searchILb0ELb1ELb1EEEvPKNS_9ColumnStrIjEES5_RKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSA_m
Line
Count
Source
140
22
                       size_t size) const {
141
22
        res.resize(size);
142
22
        StringSearch search;
143
44
        for (int i = 0; i < size; ++i) {
144
22
            StringRef substr = col_left->get_data_at(i);
145
22
            search.set_pattern(&substr);
146
22
            res[i] = locate_pos<is_ascii>(substr,
147
22
                                          col_right->get_data_at(index_check_const<str_const>(i)),
148
22
                                          search, posdata[index_check_const<pos_const>(i)]);
149
22
        }
150
22
    }
_ZNK5doris23FunctionStringLocatePos13vector_searchILb1ELb0ELb0EEEvPKNS_9ColumnStrIjEES5_RKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSA_m
Line
Count
Source
140
61
                       size_t size) const {
141
61
        res.resize(size);
142
61
        StringSearch search;
143
211
        for (int i = 0; i < size; ++i) {
144
150
            StringRef substr = col_left->get_data_at(i);
145
150
            search.set_pattern(&substr);
146
150
            res[i] = locate_pos<is_ascii>(substr,
147
150
                                          col_right->get_data_at(index_check_const<str_const>(i)),
148
150
                                          search, posdata[index_check_const<pos_const>(i)]);
149
150
        }
150
61
    }
_ZNK5doris23FunctionStringLocatePos13vector_searchILb1ELb0ELb1EEEvPKNS_9ColumnStrIjEES5_RKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSA_m
Line
Count
Source
140
60
                       size_t size) const {
141
60
        res.resize(size);
142
60
        StringSearch search;
143
120
        for (int i = 0; i < size; ++i) {
144
60
            StringRef substr = col_left->get_data_at(i);
145
60
            search.set_pattern(&substr);
146
60
            res[i] = locate_pos<is_ascii>(substr,
147
60
                                          col_right->get_data_at(index_check_const<str_const>(i)),
148
60
                                          search, posdata[index_check_const<pos_const>(i)]);
149
60
        }
150
60
    }
_ZNK5doris23FunctionStringLocatePos13vector_searchILb1ELb1ELb0EEEvPKNS_9ColumnStrIjEES5_RKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSA_m
Line
Count
Source
140
60
                       size_t size) const {
141
60
        res.resize(size);
142
60
        StringSearch search;
143
120
        for (int i = 0; i < size; ++i) {
144
60
            StringRef substr = col_left->get_data_at(i);
145
60
            search.set_pattern(&substr);
146
60
            res[i] = locate_pos<is_ascii>(substr,
147
60
                                          col_right->get_data_at(index_check_const<str_const>(i)),
148
60
                                          search, posdata[index_check_const<pos_const>(i)]);
149
60
        }
150
60
    }
_ZNK5doris23FunctionStringLocatePos13vector_searchILb1ELb1ELb1EEEvPKNS_9ColumnStrIjEES5_RKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSA_m
Line
Count
Source
140
60
                       size_t size) const {
141
60
        res.resize(size);
142
60
        StringSearch search;
143
120
        for (int i = 0; i < size; ++i) {
144
60
            StringRef substr = col_left->get_data_at(i);
145
60
            search.set_pattern(&substr);
146
60
            res[i] = locate_pos<is_ascii>(substr,
147
60
                                          col_right->get_data_at(index_check_const<str_const>(i)),
148
60
                                          search, posdata[index_check_const<pos_const>(i)]);
149
60
        }
150
60
    }
151
152
    template <bool is_ascii>
153
690
    int locate_pos(StringRef substr, StringRef str, StringSearch& search, int start_pos) const {
154
690
        if (str.size == 0 && substr.size == 0 && start_pos == 1) {
155
            // BEHAVIOR COMPATIBLE WITH MYSQL
156
            // locate('','')  locate('','',1) locate('','',2)
157
            // 1  1 0
158
11
            return 1;
159
11
        }
160
679
        if (is_ascii) {
161
499
            return locate_pos_ascii(substr, str, search, start_pos);
162
499
        } else {
163
180
            return locate_pos_utf8(substr, str, search, start_pos);
164
180
        }
165
679
    }
_ZNK5doris23FunctionStringLocatePos10locate_posILb0EEEiNS_9StringRefES2_RNS_12StringSearchEi
Line
Count
Source
153
180
    int locate_pos(StringRef substr, StringRef str, StringSearch& search, int start_pos) const {
154
180
        if (str.size == 0 && substr.size == 0 && start_pos == 1) {
155
            // BEHAVIOR COMPATIBLE WITH MYSQL
156
            // locate('','')  locate('','',1) locate('','',2)
157
            // 1  1 0
158
0
            return 1;
159
0
        }
160
180
        if (is_ascii) {
161
0
            return locate_pos_ascii(substr, str, search, start_pos);
162
180
        } else {
163
180
            return locate_pos_utf8(substr, str, search, start_pos);
164
180
        }
165
180
    }
_ZNK5doris23FunctionStringLocatePos10locate_posILb1EEEiNS_9StringRefES2_RNS_12StringSearchEi
Line
Count
Source
153
510
    int locate_pos(StringRef substr, StringRef str, StringSearch& search, int start_pos) const {
154
510
        if (str.size == 0 && substr.size == 0 && start_pos == 1) {
155
            // BEHAVIOR COMPATIBLE WITH MYSQL
156
            // locate('','')  locate('','',1) locate('','',2)
157
            // 1  1 0
158
11
            return 1;
159
11
        }
160
499
        if (is_ascii) {
161
499
            return locate_pos_ascii(substr, str, search, start_pos);
162
499
        } else {
163
0
            return locate_pos_utf8(substr, str, search, start_pos);
164
0
        }
165
499
    }
166
167
    int locate_pos_utf8(StringRef substr, StringRef str, StringSearch& search,
168
180
                        int start_pos) const {
169
180
        std::vector<size_t> index;
170
180
        size_t char_len = simd::VStringFunctions::get_char_len(str.data, str.size, index);
171
180
        if (start_pos <= 0 || start_pos > char_len) {
172
43
            return 0;
173
43
        }
174
137
        if (substr.size == 0) {
175
17
            return start_pos;
176
17
        }
177
        // Input start_pos starts from 1.
178
120
        StringRef adjusted_str(str.data + index[start_pos - 1], str.size - index[start_pos - 1]);
179
120
        int32_t match_pos = search.search(&adjusted_str);
180
120
        if (match_pos >= 0) {
181
            // Hive returns the position in the original string starting from 1.
182
104
            return start_pos + simd::VStringFunctions::get_char_len(adjusted_str.data, match_pos);
183
104
        } else {
184
16
            return 0;
185
16
        }
186
120
    }
187
188
    int locate_pos_ascii(StringRef substr, StringRef str, StringSearch& search,
189
499
                         int start_pos) const {
190
499
        if (start_pos <= 0 || start_pos > str.size) {
191
367
            return 0;
192
367
        }
193
132
        if (substr.size == 0) {
194
36
            return start_pos;
195
36
        }
196
        // Input start_pos starts from 1.
197
96
        StringRef adjusted_str(str.data + start_pos - 1, str.size - start_pos + 1);
198
96
        int32_t match_pos = search.search(&adjusted_str);
199
96
        if (match_pos >= 0) {
200
            // Hive returns the position in the original string starting from 1.
201
40
            return start_pos + match_pos;
202
56
        } else {
203
56
            return 0;
204
56
        }
205
96
    }
206
};
207
208
class FunctionSplitPart : public IFunction {
209
public:
210
    static constexpr auto name = "split_part";
211
2
    static FunctionPtr create() { return std::make_shared<FunctionSplitPart>(); }
212
1
    String get_name() const override { return name; }
213
0
    size_t get_number_of_arguments() const override { return 3; }
214
215
0
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
216
0
        return make_nullable(std::make_shared<DataTypeString>());
217
0
    }
218
219
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
220
0
                        uint32_t result, size_t input_rows_count) const override {
221
0
        DCHECK_EQ(arguments.size(), 3);
222
223
0
        auto null_map = ColumnUInt8::create(input_rows_count, 0);
224
        // Create a zero column to simply implement
225
0
        auto const_null_map = ColumnUInt8::create(input_rows_count, 0);
226
0
        auto res = ColumnString::create();
227
228
0
        auto& null_map_data = null_map->get_data();
229
0
        auto& res_offsets = res->get_offsets();
230
0
        auto& res_chars = res->get_chars();
231
0
        res_offsets.resize(input_rows_count);
232
233
0
        const size_t argument_size = arguments.size();
234
0
        std::vector<ColumnPtr> argument_columns(argument_size);
235
0
        for (size_t i = 0; i < argument_size; ++i) {
236
0
            argument_columns[i] =
237
0
                    block.get_by_position(arguments[i]).column->convert_to_full_column_if_const();
238
0
            if (const auto* nullable =
239
0
                        check_and_get_column<const ColumnNullable>(*argument_columns[i])) {
240
                // Danger: Here must dispose the null map data first! Because
241
                // argument_columns[i]=nullable->get_nested_column_ptr(); will release the mem
242
                // of column nullable mem of null map
243
0
                VectorizedUtils::update_null_map(null_map->get_data(),
244
0
                                                 nullable->get_null_map_data());
245
0
                argument_columns[i] = nullable->get_nested_column_ptr();
246
0
            }
247
0
        }
248
249
0
        const auto* str_col = assert_cast<const ColumnString*>(argument_columns[0].get());
250
251
0
        const auto* delimiter_col = assert_cast<const ColumnString*>(argument_columns[1].get());
252
253
0
        const auto* part_num_col = assert_cast<const ColumnInt32*>(argument_columns[2].get());
254
0
        const auto& part_num_col_data = part_num_col->get_data();
255
256
0
        for (size_t i = 0; i < input_rows_count; ++i) {
257
0
            if (part_num_col_data[i] == 0) {
258
0
                StringOP::push_null_string(i, res_chars, res_offsets, null_map_data);
259
0
                continue;
260
0
            }
261
262
0
            auto delimiter = delimiter_col->get_data_at(i);
263
0
            auto delimiter_str = delimiter_col->get_data_at(i).to_string();
264
0
            auto part_number = part_num_col_data[i];
265
0
            auto str = str_col->get_data_at(i);
266
0
            if (delimiter.size == 0) {
267
0
                StringOP::push_empty_string(i, res_chars, res_offsets);
268
0
                continue;
269
0
            }
270
271
0
            if (part_number > 0) {
272
0
                if (delimiter.size == 1) {
273
                    // If delimiter is a char, use memchr to split
274
0
                    int32_t pre_offset = -1;
275
0
                    int32_t offset = -1;
276
0
                    int32_t num = 0;
277
0
                    while (num < part_number) {
278
0
                        pre_offset = offset;
279
0
                        size_t n = str.size - offset - 1;
280
0
                        const char* pos = reinterpret_cast<const char*>(
281
0
                                memchr(str.data + offset + 1, delimiter_str[0], n));
282
0
                        if (pos != nullptr) {
283
0
                            offset = pos - str.data;
284
0
                            num++;
285
0
                        } else {
286
0
                            offset = str.size;
287
0
                            num = (num == 0) ? 0 : num + 1;
288
0
                            break;
289
0
                        }
290
0
                    }
291
292
0
                    if (num == part_number) {
293
0
                        StringOP::push_value_string(
294
0
                                std::string_view {
295
0
                                        reinterpret_cast<const char*>(str.data + pre_offset + 1),
296
0
                                        (size_t)offset - pre_offset - 1},
297
0
                                i, res_chars, res_offsets);
298
0
                    } else {
299
0
                        StringOP::push_null_string(i, res_chars, res_offsets, null_map_data);
300
0
                    }
301
0
                } else {
302
                    // If delimiter is a string, use memmem to split
303
0
                    int32_t pre_offset = -delimiter.size;
304
0
                    int32_t offset = -delimiter.size;
305
0
                    int32_t num = 0;
306
0
                    while (num < part_number) {
307
0
                        pre_offset = offset;
308
0
                        size_t n = str.size - offset - delimiter.size;
309
0
                        char* pos =
310
0
                                reinterpret_cast<char*>(memmem(str.data + offset + delimiter.size,
311
0
                                                               n, delimiter.data, delimiter.size));
312
0
                        if (pos != nullptr) {
313
0
                            offset = pos - str.data;
314
0
                            num++;
315
0
                        } else {
316
0
                            offset = str.size;
317
0
                            num = (num == 0) ? 0 : num + 1;
318
0
                            break;
319
0
                        }
320
0
                    }
321
322
0
                    if (num == part_number) {
323
0
                        StringOP::push_value_string(
324
0
                                std::string_view {reinterpret_cast<const char*>(
325
0
                                                          str.data + pre_offset + delimiter.size),
326
0
                                                  (size_t)offset - pre_offset - delimiter.size},
327
0
                                i, res_chars, res_offsets);
328
0
                    } else {
329
0
                        StringOP::push_null_string(i, res_chars, res_offsets, null_map_data);
330
0
                    }
331
0
                }
332
0
            } else {
333
0
                part_number = -part_number;
334
0
                auto str_str = str.to_string();
335
0
                int32_t offset = str.size;
336
0
                int32_t pre_offset = offset;
337
0
                int32_t num = 0;
338
0
                auto substr = str_str;
339
0
                while (num <= part_number && offset >= 0) {
340
0
                    offset = (int)substr.rfind(delimiter, offset);
341
0
                    if (offset != -1) {
342
0
                        if (++num == part_number) {
343
0
                            break;
344
0
                        }
345
0
                        pre_offset = offset;
346
0
                        offset = offset - 1;
347
0
                        substr = str_str.substr(0, pre_offset);
348
0
                    } else {
349
0
                        break;
350
0
                    }
351
0
                }
352
0
                num = (offset == -1 && num != 0) ? num + 1 : num;
353
354
0
                if (num == part_number) {
355
0
                    if (offset == -1) {
356
0
                        StringOP::push_value_string(std::string_view {str.data, (size_t)pre_offset},
357
0
                                                    i, res_chars, res_offsets);
358
0
                    } else {
359
0
                        StringOP::push_value_string(
360
0
                                std::string_view {str_str.substr(
361
0
                                        offset + delimiter.size,
362
0
                                        (size_t)pre_offset - offset - delimiter.size)},
363
0
                                i, res_chars, res_offsets);
364
0
                    }
365
0
                } else {
366
0
                    StringOP::push_null_string(i, res_chars, res_offsets, null_map_data);
367
0
                }
368
0
            }
369
0
        }
370
371
0
        block.get_by_position(result).column =
372
0
                ColumnNullable::create(std::move(res), std::move(null_map));
373
0
        return Status::OK();
374
0
    }
375
};
376
377
class FunctionSubstringIndex : public IFunction {
378
public:
379
    static constexpr auto name = "substring_index";
380
2
    static FunctionPtr create() { return std::make_shared<FunctionSubstringIndex>(); }
381
1
    String get_name() const override { return name; }
382
0
    size_t get_number_of_arguments() const override { return 3; }
383
384
0
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
385
0
        return std::make_shared<DataTypeString>();
386
0
    }
387
388
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
389
0
                        uint32_t result, size_t input_rows_count) const override {
390
0
        DCHECK_EQ(arguments.size(), 3);
391
392
        // Create a zero column to simply implement
393
0
        auto res = ColumnString::create();
394
395
0
        auto& res_offsets = res->get_offsets();
396
0
        auto& res_chars = res->get_chars();
397
0
        res_offsets.resize(input_rows_count);
398
0
        ColumnPtr content_column;
399
0
        bool content_const = false;
400
0
        std::tie(content_column, content_const) =
401
0
                unpack_if_const(block.get_by_position(arguments[0]).column);
402
403
0
        const auto* str_col = assert_cast<const ColumnString*>(content_column.get());
404
405
        // Handle both constant and non-constant delimiter parameters
406
0
        ColumnPtr delimiter_column_ptr;
407
0
        bool delimiter_const = false;
408
0
        std::tie(delimiter_column_ptr, delimiter_const) =
409
0
                unpack_if_const(block.get_by_position(arguments[1]).column);
410
0
        const auto* delimiter_col = assert_cast<const ColumnString*>(delimiter_column_ptr.get());
411
412
0
        ColumnPtr part_num_column_ptr;
413
0
        bool part_num_const = false;
414
0
        std::tie(part_num_column_ptr, part_num_const) =
415
0
                unpack_if_const(block.get_by_position(arguments[2]).column);
416
0
        const ColumnInt32* part_num_col =
417
0
                assert_cast<const ColumnInt32*>(part_num_column_ptr.get());
418
419
        // For constant multi-character delimiters, create StringRef and StringSearch only once
420
0
        std::optional<StringRef> const_delimiter_ref;
421
0
        std::optional<StringSearch> const_search;
422
0
        if (delimiter_const && delimiter_col->get_data_at(0).size > 1) {
423
0
            const_delimiter_ref.emplace(delimiter_col->get_data_at(0));
424
0
            const_search.emplace(&const_delimiter_ref.value());
425
0
        }
426
427
0
        for (size_t i = 0; i < input_rows_count; ++i) {
428
0
            auto str = str_col->get_data_at(content_const ? 0 : i);
429
0
            auto delimiter = delimiter_col->get_data_at(delimiter_const ? 0 : i);
430
0
            int32_t delimiter_size = delimiter.size;
431
432
0
            auto part_number = part_num_col->get_element(part_num_const ? 0 : i);
433
434
0
            if (part_number == 0 || delimiter_size == 0) {
435
0
                StringOP::push_empty_string(i, res_chars, res_offsets);
436
0
                continue;
437
0
            }
438
439
0
            if (part_number > 0) {
440
0
                if (delimiter_size == 1) {
441
0
                    int32_t offset = -1;
442
0
                    int32_t num = 0;
443
0
                    while (num < part_number) {
444
0
                        size_t n = str.size - offset - 1;
445
0
                        const char* pos = reinterpret_cast<const char*>(
446
0
                                memchr(str.data + offset + 1, delimiter.data[0], n));
447
0
                        if (pos != nullptr) {
448
0
                            offset = pos - str.data;
449
0
                            num++;
450
0
                        } else {
451
0
                            offset = str.size;
452
0
                            num = (num == 0) ? 0 : num + 1;
453
0
                            break;
454
0
                        }
455
0
                    }
456
457
0
                    if (num == part_number) {
458
0
                        StringOP::push_value_string(std::string_view {str.data, (size_t)offset}, i,
459
0
                                                    res_chars, res_offsets);
460
0
                    } else {
461
0
                        StringOP::push_value_string(std::string_view(str.data, str.size), i,
462
0
                                                    res_chars, res_offsets);
463
0
                    }
464
0
                } else {
465
                    // For multi-character delimiters
466
                    // Use pre-created StringRef and StringSearch for constant delimiters
467
0
                    StringRef delimiter_ref = const_delimiter_ref ? const_delimiter_ref.value()
468
0
                                                                  : StringRef(delimiter);
469
0
                    const StringSearch* search_ptr = const_search ? &const_search.value() : nullptr;
470
0
                    StringSearch local_search(&delimiter_ref);
471
0
                    if (!search_ptr) {
472
0
                        search_ptr = &local_search;
473
0
                    }
474
475
0
                    int32_t offset = -delimiter_size;
476
0
                    int32_t num = 0;
477
0
                    while (num < part_number) {
478
0
                        size_t n = str.size - offset - delimiter_size;
479
                        // search first match delimter_ref index from src string among str_offset to end
480
0
                        const char* pos = search_ptr->search(str.data + offset + delimiter_size, n);
481
0
                        if (pos < str.data + str.size) {
482
0
                            offset = pos - str.data;
483
0
                            num++;
484
0
                        } else {
485
0
                            offset = str.size;
486
0
                            num = (num == 0) ? 0 : num + 1;
487
0
                            break;
488
0
                        }
489
0
                    }
490
491
0
                    if (num == part_number) {
492
0
                        StringOP::push_value_string(std::string_view {str.data, (size_t)offset}, i,
493
0
                                                    res_chars, res_offsets);
494
0
                    } else {
495
0
                        StringOP::push_value_string(std::string_view(str.data, str.size), i,
496
0
                                                    res_chars, res_offsets);
497
0
                    }
498
0
                }
499
0
            } else {
500
0
                int neg_part_number = -part_number;
501
0
                auto str_str = str.to_string();
502
0
                int32_t offset = str.size;
503
0
                int32_t pre_offset = offset;
504
0
                int32_t num = 0;
505
0
                auto substr = str_str;
506
507
                // Use pre-created StringRef for constant delimiters
508
0
                StringRef delimiter_str = const_delimiter_ref
509
0
                                                  ? const_delimiter_ref.value()
510
0
                                                  : StringRef(delimiter.data, delimiter.size);
511
512
0
                while (num <= neg_part_number && offset >= 0) {
513
0
                    offset = (int)substr.rfind(delimiter_str, offset);
514
0
                    if (offset != -1) {
515
0
                        if (++num == neg_part_number) {
516
0
                            break;
517
0
                        }
518
0
                        pre_offset = offset;
519
0
                        offset = offset - 1;
520
0
                        substr = str_str.substr(0, pre_offset);
521
0
                    } else {
522
0
                        break;
523
0
                    }
524
0
                }
525
0
                num = (offset == -1 && num != 0) ? num + 1 : num;
526
527
0
                if (num == neg_part_number) {
528
0
                    if (offset == -1) {
529
0
                        StringOP::push_value_string(std::string_view(str.data, str.size), i,
530
0
                                                    res_chars, res_offsets);
531
0
                    } else {
532
0
                        StringOP::push_value_string(
533
0
                                std::string_view {str.data + offset + delimiter_size,
534
0
                                                  str.size - offset - delimiter_size},
535
0
                                i, res_chars, res_offsets);
536
0
                    }
537
0
                } else {
538
0
                    StringOP::push_value_string(std::string_view(str.data, str.size), i, res_chars,
539
0
                                                res_offsets);
540
0
                }
541
0
            }
542
0
        }
543
544
0
        block.get_by_position(result).column = std::move(res);
545
0
        return Status::OK();
546
0
    }
547
};
548
549
class FunctionSplitByString : public IFunction {
550
public:
551
    static constexpr auto name = "split_by_string";
552
553
2
    static FunctionPtr create() { return std::make_shared<FunctionSplitByString>(); }
554
    using NullMapType = PaddedPODArray<UInt8>;
555
556
1
    String get_name() const override { return name; }
557
558
1
    bool is_variadic() const override { return false; }
559
560
0
    size_t get_number_of_arguments() const override { return 2; }
561
562
0
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
563
0
        DCHECK(is_string_type(arguments[0]->get_primitive_type()))
564
0
                << "first argument for function: " << name << " should be string"
565
0
                << " and arguments[0] is " << arguments[0]->get_name();
566
0
        DCHECK(is_string_type(arguments[1]->get_primitive_type()))
567
0
                << "second argument for function: " << name << " should be string"
568
0
                << " and arguments[1] is " << arguments[1]->get_name();
569
0
        return std::make_shared<DataTypeArray>(make_nullable(arguments[0]));
570
0
    }
571
572
    Status execute_impl(FunctionContext* /*context*/, Block& block, const ColumnNumbers& arguments,
573
0
                        uint32_t result, size_t input_rows_count) const override {
574
0
        DCHECK_EQ(arguments.size(), 2);
575
576
0
        const auto& [src_column, left_const] =
577
0
                unpack_if_const(block.get_by_position(arguments[0]).column);
578
0
        const auto& [right_column, right_const] =
579
0
                unpack_if_const(block.get_by_position(arguments[1]).column);
580
581
0
        DataTypePtr right_column_type = block.get_by_position(arguments[1]).type;
582
0
        DataTypePtr src_column_type = block.get_by_position(arguments[0]).type;
583
0
        auto dest_column_ptr = ColumnArray::create(make_nullable(src_column_type)->create_column(),
584
0
                                                   ColumnArray::ColumnOffsets::create());
585
586
0
        dest_column_ptr->resize(0);
587
0
        auto& dest_offsets = dest_column_ptr->get_offsets();
588
589
0
        auto& dest_nullable_col = assert_cast<ColumnNullable&>(dest_column_ptr->get_data());
590
0
        auto* dest_nested_column = dest_nullable_col.get_nested_column_ptr().get();
591
592
0
        const auto* col_str = assert_cast<const ColumnString*>(src_column.get());
593
594
0
        const auto* col_delimiter = assert_cast<const ColumnString*>(right_column.get());
595
596
0
        std::visit(
597
0
                [&](auto src_const, auto delimiter_const) {
598
0
                    _execute<src_const, delimiter_const>(*col_str, *col_delimiter,
599
0
                                                         *dest_nested_column, dest_offsets,
600
0
                                                         input_rows_count);
601
0
                },
Unexecuted instantiation: _ZZNK5doris21FunctionSplitByString12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_E_clISt17integral_constantIbLb0EESF_EEDaSA_SB_
Unexecuted instantiation: _ZZNK5doris21FunctionSplitByString12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_E_clISt17integral_constantIbLb0EESE_IbLb1EEEEDaSA_SB_
Unexecuted instantiation: _ZZNK5doris21FunctionSplitByString12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_E_clISt17integral_constantIbLb1EESE_IbLb0EEEEDaSA_SB_
Unexecuted instantiation: _ZZNK5doris21FunctionSplitByString12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_E_clISt17integral_constantIbLb1EESF_EEDaSA_SB_
602
0
                make_bool_variant(left_const), make_bool_variant(right_const));
603
604
        // all elements in dest_nested_column are not null
605
0
        dest_nullable_col.get_null_map_column().get_data().resize_fill(dest_nested_column->size(),
606
0
                                                                       false);
607
0
        block.replace_by_position(result, std::move(dest_column_ptr));
608
609
0
        return Status::OK();
610
0
    }
611
612
private:
613
    template <bool src_const, bool delimiter_const>
614
    void _execute(const ColumnString& src_column_string, const ColumnString& delimiter_column,
615
                  IColumn& dest_nested_column, ColumnArray::Offsets64& dest_offsets,
616
0
                  size_t size) const {
617
0
        auto& dest_column_string = assert_cast<ColumnString&>(dest_nested_column);
618
0
        ColumnString::Chars& column_string_chars = dest_column_string.get_chars();
619
0
        ColumnString::Offsets& column_string_offsets = dest_column_string.get_offsets();
620
0
        column_string_chars.reserve(0);
621
622
0
        ColumnArray::Offset64 string_pos = 0;
623
0
        ColumnArray::Offset64 dest_pos = 0;
624
625
0
        StringSearch search;
626
0
        StringRef delimiter_ref_for_search;
627
628
0
        if constexpr (delimiter_const) {
629
0
            delimiter_ref_for_search = delimiter_column.get_data_at(0);
630
0
            search.set_pattern(&delimiter_ref_for_search);
631
0
        }
632
633
0
        for (size_t i = 0; i < size; i++) {
634
0
            const StringRef str_ref =
635
0
                    src_column_string.get_data_at(index_check_const<src_const>(i));
636
0
            const StringRef delimiter_ref =
637
0
                    delimiter_column.get_data_at(index_check_const<delimiter_const>(i));
638
639
0
            if (str_ref.size == 0) {
640
0
                dest_offsets.push_back(dest_pos);
641
0
                continue;
642
0
            }
643
0
            if (delimiter_ref.size == 0) {
644
0
                split_empty_delimiter(str_ref, column_string_chars, column_string_offsets,
645
0
                                      string_pos, dest_pos);
646
0
            } else {
647
0
                if constexpr (!delimiter_const) {
648
0
                    search.set_pattern(&delimiter_ref);
649
0
                }
650
0
                for (size_t str_pos = 0; str_pos <= str_ref.size;) {
651
0
                    const size_t str_offset = str_pos;
652
0
                    const size_t old_size = column_string_chars.size();
653
                    // search first match delimter_ref index from src string among str_offset to end
654
0
                    const char* result_start =
655
0
                            search.search(str_ref.data + str_offset, str_ref.size - str_offset);
656
                    // compute split part size
657
0
                    const size_t split_part_size = result_start - str_ref.data - str_offset;
658
                    // save dist string split part
659
0
                    if (split_part_size > 0) {
660
0
                        const size_t new_size = old_size + split_part_size;
661
0
                        column_string_chars.resize(new_size);
662
0
                        memcpy_small_allow_read_write_overflow15(
663
0
                                column_string_chars.data() + old_size, str_ref.data + str_offset,
664
0
                                split_part_size);
665
                        // add dist string offset
666
0
                        string_pos += split_part_size;
667
0
                    }
668
0
                    column_string_offsets.push_back(string_pos);
669
                    // array offset + 1
670
0
                    dest_pos++;
671
                    // add src string str_pos to next search start
672
0
                    str_pos += split_part_size + delimiter_ref.size;
673
0
                }
674
0
            }
675
0
            dest_offsets.push_back(dest_pos);
676
0
        }
677
0
    }
Unexecuted instantiation: _ZNK5doris21FunctionSplitByString8_executeILb0ELb0EEEvRKNS_9ColumnStrIjEES5_RNS_7IColumnERNS_8PODArrayImLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm
Unexecuted instantiation: _ZNK5doris21FunctionSplitByString8_executeILb0ELb1EEEvRKNS_9ColumnStrIjEES5_RNS_7IColumnERNS_8PODArrayImLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm
Unexecuted instantiation: _ZNK5doris21FunctionSplitByString8_executeILb1ELb0EEEvRKNS_9ColumnStrIjEES5_RNS_7IColumnERNS_8PODArrayImLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm
Unexecuted instantiation: _ZNK5doris21FunctionSplitByString8_executeILb1ELb1EEEvRKNS_9ColumnStrIjEES5_RNS_7IColumnERNS_8PODArrayImLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm
678
679
    void split_empty_delimiter(const StringRef& str_ref, ColumnString::Chars& column_string_chars,
680
                               ColumnString::Offsets& column_string_offsets,
681
                               ColumnArray::Offset64& string_pos,
682
0
                               ColumnArray::Offset64& dest_pos) const {
683
0
        const size_t old_size = column_string_chars.size();
684
0
        const size_t new_size = old_size + str_ref.size;
685
0
        column_string_chars.resize(new_size);
686
0
        memcpy(column_string_chars.data() + old_size, str_ref.data, str_ref.size);
687
0
        if (simd::VStringFunctions::is_ascii(str_ref)) {
688
0
            const auto size = str_ref.size;
689
690
0
            const auto nested_old_size = column_string_offsets.size();
691
0
            const auto nested_new_size = nested_old_size + size;
692
0
            column_string_offsets.resize(nested_new_size);
693
0
            std::iota(column_string_offsets.data() + nested_old_size,
694
0
                      column_string_offsets.data() + nested_new_size, string_pos + 1);
695
696
0
            string_pos += size;
697
0
            dest_pos += size;
698
            // The above code is equivalent to the code in the following comment.
699
            // for (size_t i = 0; i < str_ref.size; i++) {
700
            //     string_pos++;
701
            //     column_string_offsets.push_back(string_pos);
702
            //     (*dest_nested_null_map).push_back(false);
703
            //     dest_pos++;
704
            // }
705
0
        } else {
706
0
            for (size_t i = 0, utf8_char_len = 0; i < str_ref.size; i += utf8_char_len) {
707
0
                utf8_char_len = UTF8_BYTE_LENGTH[(unsigned char)str_ref.data[i]];
708
709
0
                string_pos += utf8_char_len;
710
0
                column_string_offsets.push_back(string_pos);
711
0
                dest_pos++;
712
0
            }
713
0
        }
714
0
    }
715
};
716
717
// Selects which arity of count_substrings a FunctionCountSubString
// instantiation implements.
enum class FunctionCountSubStringType {
    TWO_ARGUMENTS,   // two-argument form
    THREE_ARGUMENTS, // three-argument form
};
718
719
template <FunctionCountSubStringType type>
720
class FunctionCountSubString : public IFunction {
721
public:
722
    static constexpr auto name = "count_substrings";
723
    static constexpr auto arg_count = (type == FunctionCountSubStringType::TWO_ARGUMENTS) ? 2 : 3;
724
725
223
    static FunctionPtr create() { return std::make_shared<FunctionCountSubString>(); }
_ZN5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE6createEv
Line
Count
Source
725
45
    static FunctionPtr create() { return std::make_shared<FunctionCountSubString>(); }
_ZN5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE6createEv
Line
Count
Source
725
178
    static FunctionPtr create() { return std::make_shared<FunctionCountSubString>(); }
726
    using NullMapType = PaddedPODArray<UInt8>;
727
728
0
    String get_name() const override { return name; }
Unexecuted instantiation: _ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE8get_nameB5cxx11Ev
Unexecuted instantiation: _ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE8get_nameB5cxx11Ev
729
730
0
    size_t get_number_of_arguments() const override { return arg_count; }
Unexecuted instantiation: _ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE23get_number_of_argumentsEv
Unexecuted instantiation: _ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE23get_number_of_argumentsEv
731
732
219
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
733
219
        return std::make_shared<DataTypeInt32>();
734
219
    }
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS7_EE
Line
Count
Source
732
43
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
733
43
        return std::make_shared<DataTypeInt32>();
734
43
    }
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS7_EE
Line
Count
Source
732
176
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
733
176
        return std::make_shared<DataTypeInt32>();
734
176
    }
735
736
2
    DataTypes get_variadic_argument_types_impl() const override {
737
2
        if constexpr (type == FunctionCountSubStringType::TWO_ARGUMENTS) {
738
1
            return {std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>()};
739
1
        } else {
740
1
            return {std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>(),
741
1
                    std::make_shared<DataTypeInt32>()};
742
1
        }
743
2
    }
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE32get_variadic_argument_types_implEv
Line
Count
Source
736
1
    DataTypes get_variadic_argument_types_impl() const override {
737
1
        if constexpr (type == FunctionCountSubStringType::TWO_ARGUMENTS) {
738
1
            return {std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>()};
739
        } else {
740
            return {std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>(),
741
                    std::make_shared<DataTypeInt32>()};
742
        }
743
1
    }
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE32get_variadic_argument_types_implEv
Line
Count
Source
736
1
    DataTypes get_variadic_argument_types_impl() const override {
737
        if constexpr (type == FunctionCountSubStringType::TWO_ARGUMENTS) {
738
            return {std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>()};
739
1
        } else {
740
1
            return {std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>(),
741
1
                    std::make_shared<DataTypeInt32>()};
742
1
        }
743
1
    }
744
745
221
    bool is_variadic() const override { return true; }
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE11is_variadicEv
Line
Count
Source
745
44
    bool is_variadic() const override { return true; }
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE11is_variadicEv
Line
Count
Source
745
177
    bool is_variadic() const override { return true; }
746
747
    Status execute_impl(FunctionContext* /*context*/, Block& block, const ColumnNumbers& arguments,
748
189
                        uint32_t result, size_t input_rows_count) const override {
749
189
        DCHECK(arg_count);
750
189
        bool col_const[arg_count];
751
189
        ColumnPtr argument_columns[arg_count];
752
722
        for (int i = 0; i < arg_count; ++i) {
753
533
            std::tie(argument_columns[i], col_const[i]) =
754
533
                    unpack_if_const(block.get_by_position(arguments[i]).column);
755
533
        }
756
757
189
        auto dest_column_ptr = ColumnInt32::create(input_rows_count);
758
189
        auto& dest_column_data = dest_column_ptr->get_data();
759
760
189
        if constexpr (type == FunctionCountSubStringType::TWO_ARGUMENTS) {
761
34
            const auto& src_column_string = assert_cast<const ColumnString&>(*argument_columns[0]);
762
34
            const auto& pattern_column = assert_cast<const ColumnString&>(*argument_columns[1]);
763
34
            std::visit(
764
34
                    [&](auto str_const, auto pattern_const) {
765
34
                        _execute<str_const, pattern_const>(src_column_string, pattern_column,
766
34
                                                           dest_column_data, input_rows_count);
767
34
                    },
_ZZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_E_clISt17integral_constantIbLb0EESH_EEDaSC_SD_
Line
Count
Source
764
12
                    [&](auto str_const, auto pattern_const) {
765
12
                        _execute<str_const, pattern_const>(src_column_string, pattern_column,
766
12
                                                           dest_column_data, input_rows_count);
767
12
                    },
_ZZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_E_clISt17integral_constantIbLb0EESG_IbLb1EEEEDaSC_SD_
Line
Count
Source
764
11
                    [&](auto str_const, auto pattern_const) {
765
11
                        _execute<str_const, pattern_const>(src_column_string, pattern_column,
766
11
                                                           dest_column_data, input_rows_count);
767
11
                    },
_ZZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_E_clISt17integral_constantIbLb1EESG_IbLb0EEEEDaSC_SD_
Line
Count
Source
764
11
                    [&](auto str_const, auto pattern_const) {
765
11
                        _execute<str_const, pattern_const>(src_column_string, pattern_column,
766
11
                                                           dest_column_data, input_rows_count);
767
11
                    },
Unexecuted instantiation: _ZZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_E_clISt17integral_constantIbLb1EESH_EEDaSC_SD_
768
34
                    make_bool_variant(col_const[0]), make_bool_variant(col_const[1]));
769
155
        } else {
770
155
            const auto& src_column_string = assert_cast<const ColumnString&>(*argument_columns[0]);
771
155
            const auto& pattern_column = assert_cast<const ColumnString&>(*argument_columns[1]);
772
155
            const auto& start_pos_column = assert_cast<const ColumnInt32&>(*argument_columns[2]);
773
155
            std::visit(
774
155
                    [&](auto str_const, auto pattern_const, auto start_pos_const) {
775
155
                        _execute<str_const, pattern_const, start_pos_const>(
776
155
                                src_column_string, pattern_column, start_pos_column,
777
155
                                dest_column_data, input_rows_count);
778
155
                    },
_ZZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb0EESI_SI_EEDaSC_SD_SE_
Line
Count
Source
774
23
                    [&](auto str_const, auto pattern_const, auto start_pos_const) {
775
23
                        _execute<str_const, pattern_const, start_pos_const>(
776
23
                                src_column_string, pattern_column, start_pos_column,
777
23
                                dest_column_data, input_rows_count);
778
23
                    },
_ZZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb0EESI_SH_IbLb1EEEEDaSC_SD_SE_
Line
Count
Source
774
22
                    [&](auto str_const, auto pattern_const, auto start_pos_const) {
775
22
                        _execute<str_const, pattern_const, start_pos_const>(
776
22
                                src_column_string, pattern_column, start_pos_column,
777
22
                                dest_column_data, input_rows_count);
778
22
                    },
_ZZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb0EESH_IbLb1EESI_EEDaSC_SD_SE_
Line
Count
Source
774
22
                    [&](auto str_const, auto pattern_const, auto start_pos_const) {
775
22
                        _execute<str_const, pattern_const, start_pos_const>(
776
22
                                src_column_string, pattern_column, start_pos_column,
777
22
                                dest_column_data, input_rows_count);
778
22
                    },
_ZZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb0EESH_IbLb1EESJ_EEDaSC_SD_SE_
Line
Count
Source
774
22
                    [&](auto str_const, auto pattern_const, auto start_pos_const) {
775
22
                        _execute<str_const, pattern_const, start_pos_const>(
776
22
                                src_column_string, pattern_column, start_pos_column,
777
22
                                dest_column_data, input_rows_count);
778
22
                    },
_ZZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb1EESH_IbLb0EESJ_EEDaSC_SD_SE_
Line
Count
Source
774
22
                    [&](auto str_const, auto pattern_const, auto start_pos_const) {
775
22
                        _execute<str_const, pattern_const, start_pos_const>(
776
22
                                src_column_string, pattern_column, start_pos_column,
777
22
                                dest_column_data, input_rows_count);
778
22
                    },
_ZZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb1EESH_IbLb0EESI_EEDaSC_SD_SE_
Line
Count
Source
774
22
                    [&](auto str_const, auto pattern_const, auto start_pos_const) {
775
22
                        _execute<str_const, pattern_const, start_pos_const>(
776
22
                                src_column_string, pattern_column, start_pos_column,
777
22
                                dest_column_data, input_rows_count);
778
22
                    },
_ZZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb1EESI_SH_IbLb0EEEEDaSC_SD_SE_
Line
Count
Source
774
22
                    [&](auto str_const, auto pattern_const, auto start_pos_const) {
775
22
                        _execute<str_const, pattern_const, start_pos_const>(
776
22
                                src_column_string, pattern_column, start_pos_column,
777
22
                                dest_column_data, input_rows_count);
778
22
                    },
Unexecuted instantiation: _ZZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb1EESI_SI_EEDaSC_SD_SE_
779
155
                    make_bool_variant(col_const[0]), make_bool_variant(col_const[1]),
780
155
                    make_bool_variant(col_const[2]));
781
155
        }
782
783
189
        block.replace_by_position(result, std::move(dest_column_ptr));
784
189
        return Status::OK();
785
189
    }
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm
Line
Count
Source
748
34
                        uint32_t result, size_t input_rows_count) const override {
749
34
        DCHECK(arg_count);
750
34
        bool col_const[arg_count];
751
34
        ColumnPtr argument_columns[arg_count];
752
102
        for (int i = 0; i < arg_count; ++i) {
753
68
            std::tie(argument_columns[i], col_const[i]) =
754
68
                    unpack_if_const(block.get_by_position(arguments[i]).column);
755
68
        }
756
757
34
        auto dest_column_ptr = ColumnInt32::create(input_rows_count);
758
34
        auto& dest_column_data = dest_column_ptr->get_data();
759
760
34
        if constexpr (type == FunctionCountSubStringType::TWO_ARGUMENTS) {
761
34
            const auto& src_column_string = assert_cast<const ColumnString&>(*argument_columns[0]);
762
34
            const auto& pattern_column = assert_cast<const ColumnString&>(*argument_columns[1]);
763
34
            std::visit(
764
34
                    [&](auto str_const, auto pattern_const) {
765
34
                        _execute<str_const, pattern_const>(src_column_string, pattern_column,
766
34
                                                           dest_column_data, input_rows_count);
767
34
                    },
768
34
                    make_bool_variant(col_const[0]), make_bool_variant(col_const[1]));
769
        } else {
770
            const auto& src_column_string = assert_cast<const ColumnString&>(*argument_columns[0]);
771
            const auto& pattern_column = assert_cast<const ColumnString&>(*argument_columns[1]);
772
            const auto& start_pos_column = assert_cast<const ColumnInt32&>(*argument_columns[2]);
773
            std::visit(
774
                    [&](auto str_const, auto pattern_const, auto start_pos_const) {
775
                        _execute<str_const, pattern_const, start_pos_const>(
776
                                src_column_string, pattern_column, start_pos_column,
777
                                dest_column_data, input_rows_count);
778
                    },
779
                    make_bool_variant(col_const[0]), make_bool_variant(col_const[1]),
780
                    make_bool_variant(col_const[2]));
781
        }
782
783
34
        block.replace_by_position(result, std::move(dest_column_ptr));
784
34
        return Status::OK();
785
34
    }
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm
Line
Count
Source
748
155
                        uint32_t result, size_t input_rows_count) const override {
749
155
        DCHECK(arg_count);
750
155
        bool col_const[arg_count];
751
155
        ColumnPtr argument_columns[arg_count];
752
620
        for (int i = 0; i < arg_count; ++i) {
753
465
            std::tie(argument_columns[i], col_const[i]) =
754
465
                    unpack_if_const(block.get_by_position(arguments[i]).column);
755
465
        }
756
757
155
        auto dest_column_ptr = ColumnInt32::create(input_rows_count);
758
155
        auto& dest_column_data = dest_column_ptr->get_data();
759
760
        if constexpr (type == FunctionCountSubStringType::TWO_ARGUMENTS) {
761
            const auto& src_column_string = assert_cast<const ColumnString&>(*argument_columns[0]);
762
            const auto& pattern_column = assert_cast<const ColumnString&>(*argument_columns[1]);
763
            std::visit(
764
                    [&](auto str_const, auto pattern_const) {
765
                        _execute<str_const, pattern_const>(src_column_string, pattern_column,
766
                                                           dest_column_data, input_rows_count);
767
                    },
768
                    make_bool_variant(col_const[0]), make_bool_variant(col_const[1]));
769
155
        } else {
770
155
            const auto& src_column_string = assert_cast<const ColumnString&>(*argument_columns[0]);
771
155
            const auto& pattern_column = assert_cast<const ColumnString&>(*argument_columns[1]);
772
155
            const auto& start_pos_column = assert_cast<const ColumnInt32&>(*argument_columns[2]);
773
155
            std::visit(
774
155
                    [&](auto str_const, auto pattern_const, auto start_pos_const) {
775
155
                        _execute<str_const, pattern_const, start_pos_const>(
776
155
                                src_column_string, pattern_column, start_pos_column,
777
155
                                dest_column_data, input_rows_count);
778
155
                    },
779
155
                    make_bool_variant(col_const[0]), make_bool_variant(col_const[1]),
780
155
                    make_bool_variant(col_const[2]));
781
155
        }
782
783
155
        block.replace_by_position(result, std::move(dest_column_ptr));
784
155
        return Status::OK();
785
155
    }
786
787
private:
788
    template <bool src_const, bool pattern_const>
789
    void _execute(const ColumnString& src_column_string, const ColumnString& pattern_column,
790
34
                  ColumnInt32::Container& dest_column_data, size_t size) const {
791
81
        for (size_t i = 0; i < size; i++) {
792
47
            const StringRef str_ref =
793
47
                    src_column_string.get_data_at(index_check_const<src_const>(i));
794
795
47
            const StringRef pattern_ref =
796
47
                    pattern_column.get_data_at(index_check_const<pattern_const>(i));
797
47
            dest_column_data[i] = find_str_count(str_ref, pattern_ref);
798
47
        }
799
34
    }
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE8_executeILb0ELb0EEEvRKNS_9ColumnStrIjEES7_RNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm
Line
Count
Source
790
12
                  ColumnInt32::Container& dest_column_data, size_t size) const {
791
37
        for (size_t i = 0; i < size; i++) {
792
25
            const StringRef str_ref =
793
25
                    src_column_string.get_data_at(index_check_const<src_const>(i));
794
795
25
            const StringRef pattern_ref =
796
25
                    pattern_column.get_data_at(index_check_const<pattern_const>(i));
797
25
            dest_column_data[i] = find_str_count(str_ref, pattern_ref);
798
25
        }
799
12
    }
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE8_executeILb0ELb1EEEvRKNS_9ColumnStrIjEES7_RNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm
Line
Count
Source
790
11
                  ColumnInt32::Container& dest_column_data, size_t size) const {
791
22
        for (size_t i = 0; i < size; i++) {
792
11
            const StringRef str_ref =
793
11
                    src_column_string.get_data_at(index_check_const<src_const>(i));
794
795
11
            const StringRef pattern_ref =
796
11
                    pattern_column.get_data_at(index_check_const<pattern_const>(i));
797
11
            dest_column_data[i] = find_str_count(str_ref, pattern_ref);
798
11
        }
799
11
    }
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE8_executeILb1ELb0EEEvRKNS_9ColumnStrIjEES7_RNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm
Line
Count
Source
790
11
                  ColumnInt32::Container& dest_column_data, size_t size) const {
791
22
        for (size_t i = 0; i < size; i++) {
792
11
            const StringRef str_ref =
793
11
                    src_column_string.get_data_at(index_check_const<src_const>(i));
794
795
11
            const StringRef pattern_ref =
796
11
                    pattern_column.get_data_at(index_check_const<pattern_const>(i));
797
11
            dest_column_data[i] = find_str_count(str_ref, pattern_ref);
798
11
        }
799
11
    }
Unexecuted instantiation: _ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE8_executeILb1ELb1EEEvRKNS_9ColumnStrIjEES7_RNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm
800
801
    template <bool src_const, bool pattern_const, bool start_pos_const>
802
    void _execute(const ColumnString& src_column_string, const ColumnString& pattern_column,
803
                  const ColumnInt32& start_pos_column, ColumnInt32::Container& dest_column_data,
804
155
                  size_t size) const {
805
334
        for (size_t i = 0; i < size; i++) {
806
179
            const StringRef str_ref =
807
179
                    src_column_string.get_data_at(index_check_const<src_const>(i));
808
179
            const StringRef pattern_ref =
809
179
                    pattern_column.get_data_at(index_check_const<pattern_const>(i));
810
            // 1-based index
811
179
            int32_t start_pos =
812
179
                    start_pos_column.get_element(index_check_const<start_pos_const>(i)) - 1;
813
814
179
            const char* p = str_ref.begin();
815
179
            const char* end = str_ref.end();
816
179
            int char_size = 0;
817
1.22k
            for (size_t j = 0; j < start_pos && p < end; ++j, p += char_size) {
818
1.04k
                char_size = UTF8_BYTE_LENGTH[static_cast<uint8_t>(*p)];
819
1.04k
            }
820
179
            const auto start_byte_len = p - str_ref.begin();
821
822
179
            if (start_pos < 0 || start_byte_len >= str_ref.size) {
823
115
                dest_column_data[i] = 0;
824
115
            } else {
825
64
                dest_column_data[i] =
826
64
                        find_str_count(str_ref.substring(start_byte_len), pattern_ref);
827
64
            }
828
179
        }
829
155
    }
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE8_executeILb0ELb0ELb0EEEvRKNS_9ColumnStrIjEES7_RKNS_12ColumnVectorILNS_13PrimitiveTypeE5EEERNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm
Line
Count
Source
804
23
                  size_t size) const {
805
70
        for (size_t i = 0; i < size; i++) {
806
47
            const StringRef str_ref =
807
47
                    src_column_string.get_data_at(index_check_const<src_const>(i));
808
47
            const StringRef pattern_ref =
809
47
                    pattern_column.get_data_at(index_check_const<pattern_const>(i));
810
            // 1-based index
811
47
            int32_t start_pos =
812
47
                    start_pos_column.get_element(index_check_const<start_pos_const>(i)) - 1;
813
814
47
            const char* p = str_ref.begin();
815
47
            const char* end = str_ref.end();
816
47
            int char_size = 0;
817
316
            for (size_t j = 0; j < start_pos && p < end; ++j, p += char_size) {
818
269
                char_size = UTF8_BYTE_LENGTH[static_cast<uint8_t>(*p)];
819
269
            }
820
47
            const auto start_byte_len = p - str_ref.begin();
821
822
47
            if (start_pos < 0 || start_byte_len >= str_ref.size) {
823
31
                dest_column_data[i] = 0;
824
31
            } else {
825
16
                dest_column_data[i] =
826
16
                        find_str_count(str_ref.substring(start_byte_len), pattern_ref);
827
16
            }
828
47
        }
829
23
    }
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE8_executeILb0ELb0ELb1EEEvRKNS_9ColumnStrIjEES7_RKNS_12ColumnVectorILNS_13PrimitiveTypeE5EEERNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm
Line
Count
Source
804
22
                  size_t size) const {
805
44
        for (size_t i = 0; i < size; i++) {
806
22
            const StringRef str_ref =
807
22
                    src_column_string.get_data_at(index_check_const<src_const>(i));
808
22
            const StringRef pattern_ref =
809
22
                    pattern_column.get_data_at(index_check_const<pattern_const>(i));
810
            // 1-based index
811
22
            int32_t start_pos =
812
22
                    start_pos_column.get_element(index_check_const<start_pos_const>(i)) - 1;
813
814
22
            const char* p = str_ref.begin();
815
22
            const char* end = str_ref.end();
816
22
            int char_size = 0;
817
151
            for (size_t j = 0; j < start_pos && p < end; ++j, p += char_size) {
818
129
                char_size = UTF8_BYTE_LENGTH[static_cast<uint8_t>(*p)];
819
129
            }
820
22
            const auto start_byte_len = p - str_ref.begin();
821
822
22
            if (start_pos < 0 || start_byte_len >= str_ref.size) {
823
14
                dest_column_data[i] = 0;
824
14
            } else {
825
8
                dest_column_data[i] =
826
8
                        find_str_count(str_ref.substring(start_byte_len), pattern_ref);
827
8
            }
828
22
        }
829
22
    }
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE8_executeILb0ELb1ELb0EEEvRKNS_9ColumnStrIjEES7_RKNS_12ColumnVectorILNS_13PrimitiveTypeE5EEERNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm
Line
Count
Source
804
22
                  size_t size) const {
805
44
        for (size_t i = 0; i < size; i++) {
806
22
            const StringRef str_ref =
807
22
                    src_column_string.get_data_at(index_check_const<src_const>(i));
808
22
            const StringRef pattern_ref =
809
22
                    pattern_column.get_data_at(index_check_const<pattern_const>(i));
810
            // 1-based index
811
22
            int32_t start_pos =
812
22
                    start_pos_column.get_element(index_check_const<start_pos_const>(i)) - 1;
813
814
22
            const char* p = str_ref.begin();
815
22
            const char* end = str_ref.end();
816
22
            int char_size = 0;
817
151
            for (size_t j = 0; j < start_pos && p < end; ++j, p += char_size) {
818
129
                char_size = UTF8_BYTE_LENGTH[static_cast<uint8_t>(*p)];
819
129
            }
820
22
            const auto start_byte_len = p - str_ref.begin();
821
822
22
            if (start_pos < 0 || start_byte_len >= str_ref.size) {
823
14
                dest_column_data[i] = 0;
824
14
            } else {
825
8
                dest_column_data[i] =
826
8
                        find_str_count(str_ref.substring(start_byte_len), pattern_ref);
827
8
            }
828
22
        }
829
22
    }
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE8_executeILb0ELb1ELb1EEEvRKNS_9ColumnStrIjEES7_RKNS_12ColumnVectorILNS_13PrimitiveTypeE5EEERNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm
Line
Count
Source
804
22
                  size_t size) const {
805
44
        for (size_t i = 0; i < size; i++) {
806
22
            const StringRef str_ref =
807
22
                    src_column_string.get_data_at(index_check_const<src_const>(i));
808
22
            const StringRef pattern_ref =
809
22
                    pattern_column.get_data_at(index_check_const<pattern_const>(i));
810
            // 1-based index
811
22
            int32_t start_pos =
812
22
                    start_pos_column.get_element(index_check_const<start_pos_const>(i)) - 1;
813
814
22
            const char* p = str_ref.begin();
815
22
            const char* end = str_ref.end();
816
22
            int char_size = 0;
817
151
            for (size_t j = 0; j < start_pos && p < end; ++j, p += char_size) {
818
129
                char_size = UTF8_BYTE_LENGTH[static_cast<uint8_t>(*p)];
819
129
            }
820
22
            const auto start_byte_len = p - str_ref.begin();
821
822
22
            if (start_pos < 0 || start_byte_len >= str_ref.size) {
823
14
                dest_column_data[i] = 0;
824
14
            } else {
825
8
                dest_column_data[i] =
826
8
                        find_str_count(str_ref.substring(start_byte_len), pattern_ref);
827
8
            }
828
22
        }
829
22
    }
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE8_executeILb1ELb0ELb0EEEvRKNS_9ColumnStrIjEES7_RKNS_12ColumnVectorILNS_13PrimitiveTypeE5EEERNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm
Line
Count
Source
804
22
                  size_t size) const {
805
44
        for (size_t i = 0; i < size; i++) {
806
22
            const StringRef str_ref =
807
22
                    src_column_string.get_data_at(index_check_const<src_const>(i));
808
22
            const StringRef pattern_ref =
809
22
                    pattern_column.get_data_at(index_check_const<pattern_const>(i));
810
            // 1-based index
811
22
            int32_t start_pos =
812
22
                    start_pos_column.get_element(index_check_const<start_pos_const>(i)) - 1;
813
814
22
            const char* p = str_ref.begin();
815
22
            const char* end = str_ref.end();
816
22
            int char_size = 0;
817
151
            for (size_t j = 0; j < start_pos && p < end; ++j, p += char_size) {
818
129
                char_size = UTF8_BYTE_LENGTH[static_cast<uint8_t>(*p)];
819
129
            }
820
22
            const auto start_byte_len = p - str_ref.begin();
821
822
22
            if (start_pos < 0 || start_byte_len >= str_ref.size) {
823
14
                dest_column_data[i] = 0;
824
14
            } else {
825
8
                dest_column_data[i] =
826
8
                        find_str_count(str_ref.substring(start_byte_len), pattern_ref);
827
8
            }
828
22
        }
829
22
    }
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE8_executeILb1ELb0ELb1EEEvRKNS_9ColumnStrIjEES7_RKNS_12ColumnVectorILNS_13PrimitiveTypeE5EEERNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm
Line
Count
Source
804
22
                  size_t size) const {
805
44
        for (size_t i = 0; i < size; i++) {
806
22
            const StringRef str_ref =
807
22
                    src_column_string.get_data_at(index_check_const<src_const>(i));
808
22
            const StringRef pattern_ref =
809
22
                    pattern_column.get_data_at(index_check_const<pattern_const>(i));
810
            // 1-based index
811
22
            int32_t start_pos =
812
22
                    start_pos_column.get_element(index_check_const<start_pos_const>(i)) - 1;
813
814
22
            const char* p = str_ref.begin();
815
22
            const char* end = str_ref.end();
816
22
            int char_size = 0;
817
151
            for (size_t j = 0; j < start_pos && p < end; ++j, p += char_size) {
818
129
                char_size = UTF8_BYTE_LENGTH[static_cast<uint8_t>(*p)];
819
129
            }
820
22
            const auto start_byte_len = p - str_ref.begin();
821
822
22
            if (start_pos < 0 || start_byte_len >= str_ref.size) {
823
14
                dest_column_data[i] = 0;
824
14
            } else {
825
8
                dest_column_data[i] =
826
8
                        find_str_count(str_ref.substring(start_byte_len), pattern_ref);
827
8
            }
828
22
        }
829
22
    }
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE8_executeILb1ELb1ELb0EEEvRKNS_9ColumnStrIjEES7_RKNS_12ColumnVectorILNS_13PrimitiveTypeE5EEERNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm
Line
Count
Source
804
22
                  size_t size) const {
805
44
        for (size_t i = 0; i < size; i++) {
806
22
            const StringRef str_ref =
807
22
                    src_column_string.get_data_at(index_check_const<src_const>(i));
808
22
            const StringRef pattern_ref =
809
22
                    pattern_column.get_data_at(index_check_const<pattern_const>(i));
810
            // 1-based index
811
22
            int32_t start_pos =
812
22
                    start_pos_column.get_element(index_check_const<start_pos_const>(i)) - 1;
813
814
22
            const char* p = str_ref.begin();
815
22
            const char* end = str_ref.end();
816
22
            int char_size = 0;
817
151
            for (size_t j = 0; j < start_pos && p < end; ++j, p += char_size) {
818
129
                char_size = UTF8_BYTE_LENGTH[static_cast<uint8_t>(*p)];
819
129
            }
820
22
            const auto start_byte_len = p - str_ref.begin();
821
822
22
            if (start_pos < 0 || start_byte_len >= str_ref.size) {
823
14
                dest_column_data[i] = 0;
824
14
            } else {
825
8
                dest_column_data[i] =
826
8
                        find_str_count(str_ref.substring(start_byte_len), pattern_ref);
827
8
            }
828
22
        }
829
22
    }
Unexecuted instantiation: _ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE8_executeILb1ELb1ELb1EEEvRKNS_9ColumnStrIjEES7_RKNS_12ColumnVectorILNS_13PrimitiveTypeE5EEERNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm
830
831
208
    size_t find_pos(size_t pos, const StringRef str_ref, const StringRef pattern_ref) const {
832
208
        size_t old_size = pos;
833
208
        size_t str_size = str_ref.size;
834
1.15k
        while (pos < str_size &&
835
1.15k
               memcmp_small_allow_overflow15((const uint8_t*)str_ref.data + pos,
836
1.06k
                                             (const uint8_t*)pattern_ref.data, pattern_ref.size)) {
837
948
            pos++;
838
948
        }
839
208
        return pos - old_size;
840
208
    }
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE8find_posEmNS_9StringRefES3_
Line
Count
Source
831
56
    size_t find_pos(size_t pos, const StringRef str_ref, const StringRef pattern_ref) const {
832
56
        size_t old_size = pos;
833
56
        size_t str_size = str_ref.size;
834
372
        while (pos < str_size &&
835
372
               memcmp_small_allow_overflow15((const uint8_t*)str_ref.data + pos,
836
344
                                             (const uint8_t*)pattern_ref.data, pattern_ref.size)) {
837
316
            pos++;
838
316
        }
839
56
        return pos - old_size;
840
56
    }
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE8find_posEmNS_9StringRefES3_
Line
Count
Source
831
152
    size_t find_pos(size_t pos, const StringRef str_ref, const StringRef pattern_ref) const {
832
152
        size_t old_size = pos;
833
152
        size_t str_size = str_ref.size;
834
784
        while (pos < str_size &&
835
784
               memcmp_small_allow_overflow15((const uint8_t*)str_ref.data + pos,
836
720
                                             (const uint8_t*)pattern_ref.data, pattern_ref.size)) {
837
632
            pos++;
838
632
        }
839
152
        return pos - old_size;
840
152
    }
841
842
111
    int find_str_count(const StringRef str_ref, StringRef pattern_ref) const {
843
111
        int count = 0;
844
111
        if (str_ref.size == 0 || pattern_ref.size == 0) {
845
19
            return 0;
846
92
        } else {
847
208
            for (size_t str_pos = 0; str_pos <= str_ref.size;) {
848
208
                const size_t res_pos = find_pos(str_pos, str_ref, pattern_ref);
849
208
                if (res_pos == (str_ref.size - str_pos)) {
850
92
                    break; // not find
851
92
                }
852
116
                count++;
853
116
                str_pos = str_pos + res_pos + pattern_ref.size;
854
116
            }
855
92
        }
856
92
        return count;
857
111
    }
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE14find_str_countENS_9StringRefES3_
Line
Count
Source
842
47
    int find_str_count(const StringRef str_ref, StringRef pattern_ref) const {
843
47
        int count = 0;
844
47
        if (str_ref.size == 0 || pattern_ref.size == 0) {
845
19
            return 0;
846
28
        } else {
847
56
            for (size_t str_pos = 0; str_pos <= str_ref.size;) {
848
56
                const size_t res_pos = find_pos(str_pos, str_ref, pattern_ref);
849
56
                if (res_pos == (str_ref.size - str_pos)) {
850
28
                    break; // not find
851
28
                }
852
28
                count++;
853
28
                str_pos = str_pos + res_pos + pattern_ref.size;
854
28
            }
855
28
        }
856
28
        return count;
857
47
    }
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE14find_str_countENS_9StringRefES3_
Line
Count
Source
842
64
    int find_str_count(const StringRef str_ref, StringRef pattern_ref) const {
843
64
        int count = 0;
844
64
        if (str_ref.size == 0 || pattern_ref.size == 0) {
845
0
            return 0;
846
64
        } else {
847
152
            for (size_t str_pos = 0; str_pos <= str_ref.size;) {
848
152
                const size_t res_pos = find_pos(str_pos, str_ref, pattern_ref);
849
152
                if (res_pos == (str_ref.size - str_pos)) {
850
64
                    break; // not find
851
64
                }
852
88
                count++;
853
88
                str_pos = str_pos + res_pos + pattern_ref.size;
854
88
            }
855
64
        }
856
64
        return count;
857
64
    }
858
};
859
860
1
void register_function_string_search(SimpleFunctionFactory& factory) {
861
1
    factory.register_function<FunctionStringLocatePos>();
862
1
    factory.register_function<FunctionSplitPart>();
863
1
    factory.register_function<FunctionSplitByString>();
864
1
    factory.register_function<FunctionCountSubString<FunctionCountSubStringType::TWO_ARGUMENTS>>();
865
1
    factory.register_function<
866
1
            FunctionCountSubString<FunctionCountSubStringType::THREE_ARGUMENTS>>();
867
1
    factory.register_function<FunctionSubstringIndex>();
868
869
1
    factory.register_alias(FunctionStringLocatePos::name, "position");
870
1
}
871
872
#include "common/compile_check_avoid_end.h"
873
} // namespace doris