be/src/exprs/function/function_string_search.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include <cstddef> |
19 | | #include <cstring> |
20 | | #include <numeric> |
21 | | #include <string> |
22 | | #include <string_view> |
23 | | #include <vector> |
24 | | |
25 | | #include "common/status.h" |
26 | | #include "core/assert_cast.h" |
27 | | #include "core/block/block.h" |
28 | | #include "core/block/column_numbers.h" |
29 | | #include "core/column/column_array.h" |
30 | | #include "core/column/column_const.h" |
31 | | #include "core/column/column_nullable.h" |
32 | | #include "core/column/column_string.h" |
33 | | #include "core/column/column_vector.h" |
34 | | #include "core/data_type/data_type_array.h" |
35 | | #include "core/data_type/data_type_nullable.h" |
36 | | #include "core/data_type/data_type_number.h" |
37 | | #include "core/data_type/data_type_string.h" |
38 | | #include "core/data_type/define_primitive_type.h" |
39 | | #include "core/memcmp_small.h" |
40 | | #include "core/memcpy_small.h" |
41 | | #include "core/pod_array_fwd.h" |
42 | | #include "core/string_ref.h" |
43 | | #include "exec/common/stringop_substring.h" |
44 | | #include "exec/common/template_helpers.hpp" |
45 | | #include "exec/common/util.hpp" |
46 | | #include "exprs/function/function.h" |
47 | | #include "exprs/function/function_helpers.h" |
48 | | #include "exprs/function/simple_function_factory.h" |
49 | | #include "exprs/function_context.h" |
50 | | #include "util/simd/vstring_function.h" |
51 | | #include "util/string_search.hpp" |
52 | | |
53 | | namespace doris { |
54 | | #include "common/compile_check_avoid_begin.h" |
55 | | |
56 | | class FunctionStringLocatePos : public IFunction { |
57 | | public: |
58 | | static constexpr auto name = "locate"; |
59 | 873 | static FunctionPtr create() { return std::make_shared<FunctionStringLocatePos>(); } |
60 | 0 | String get_name() const override { return name; } |
61 | 0 | size_t get_number_of_arguments() const override { return 3; } |
62 | | |
63 | 864 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
64 | 864 | return std::make_shared<DataTypeInt32>(); |
65 | 864 | } |
66 | | |
67 | 8 | DataTypes get_variadic_argument_types_impl() const override { |
68 | 8 | return {std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>(), |
69 | 8 | std::make_shared<DataTypeInt32>()}; |
70 | 8 | } |
71 | | |
72 | 865 | bool is_variadic() const override { return true; } |
73 | | |
74 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
75 | 650 | uint32_t result, size_t input_rows_count) const override { |
76 | 650 | if (arguments.size() != 3) { |
77 | 0 | return Status::InvalidArgument("Function {} requires 3 arguments, but got {}", |
78 | 0 | get_name(), arguments.size()); |
79 | 0 | } |
80 | 650 | bool col_const[3]; |
81 | 650 | ColumnPtr argument_columns[3]; |
82 | 2.59k | for (int i = 0; i < 3; ++i) { |
83 | 1.94k | std::tie(argument_columns[i], col_const[i]) = |
84 | 1.94k | unpack_if_const(block.get_by_position(arguments[i]).column); |
85 | 1.94k | } |
86 | | |
87 | 650 | const auto* col_left = assert_cast<const ColumnString*>(argument_columns[0].get()); |
88 | 650 | const auto* col_right = assert_cast<const ColumnString*>(argument_columns[1].get()); |
89 | 650 | const auto* col_pos = assert_cast<const ColumnInt32*>(argument_columns[2].get()); |
90 | | |
91 | 650 | ColumnInt32::MutablePtr col_res = ColumnInt32::create(); |
92 | 650 | auto& vec_res = col_res->get_data(); |
93 | 650 | vec_res.resize(block.rows()); |
94 | | |
95 | 650 | const bool is_ascii = col_left->is_ascii() && col_right->is_ascii(); |
96 | | |
97 | 650 | if (col_const[0]) { |
98 | 250 | std::visit( |
99 | 250 | [&](auto is_ascii, auto str_const, auto pos_const) { |
100 | 250 | scalar_search<is_ascii, str_const, pos_const>( |
101 | 250 | col_left->get_data_at(0), col_right, col_pos->get_data(), vec_res, |
102 | 250 | input_rows_count); |
103 | 250 | }, _ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb0EESG_SG_EEDaSA_SB_SC_ Line | Count | Source | 99 | 22 | [&](auto is_ascii, auto str_const, auto pos_const) { | 100 | 22 | scalar_search<is_ascii, str_const, pos_const>( | 101 | 22 | col_left->get_data_at(0), col_right, col_pos->get_data(), vec_res, | 102 | 22 | input_rows_count); | 103 | 22 | }, |
_ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb0EESG_SF_IbLb1EEEEDaSA_SB_SC_ Line | Count | Source | 99 | 22 | [&](auto is_ascii, auto str_const, auto pos_const) { | 100 | 22 | scalar_search<is_ascii, str_const, pos_const>( | 101 | 22 | col_left->get_data_at(0), col_right, col_pos->get_data(), vec_res, | 102 | 22 | input_rows_count); | 103 | 22 | }, |
_ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb0EESF_IbLb1EESG_EEDaSA_SB_SC_ Line | Count | Source | 99 | 22 | [&](auto is_ascii, auto str_const, auto pos_const) { | 100 | 22 | scalar_search<is_ascii, str_const, pos_const>( | 101 | 22 | col_left->get_data_at(0), col_right, col_pos->get_data(), vec_res, | 102 | 22 | input_rows_count); | 103 | 22 | }, |
Unexecuted instantiation: _ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb0EESF_IbLb1EESH_EEDaSA_SB_SC_ _ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb1EESF_IbLb0EESH_EEDaSA_SB_SC_ Line | Count | Source | 99 | 60 | [&](auto is_ascii, auto str_const, auto pos_const) { | 100 | 60 | scalar_search<is_ascii, str_const, pos_const>( | 101 | 60 | col_left->get_data_at(0), col_right, col_pos->get_data(), vec_res, | 102 | 60 | input_rows_count); | 103 | 60 | }, |
_ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb1EESF_IbLb0EESG_EEDaSA_SB_SC_ Line | Count | Source | 99 | 64 | [&](auto is_ascii, auto str_const, auto pos_const) { | 100 | 64 | scalar_search<is_ascii, str_const, pos_const>( | 101 | 64 | col_left->get_data_at(0), col_right, col_pos->get_data(), vec_res, | 102 | 64 | input_rows_count); | 103 | 64 | }, |
_ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb1EESG_SF_IbLb0EEEEDaSA_SB_SC_ Line | Count | Source | 99 | 60 | [&](auto is_ascii, auto str_const, auto pos_const) { | 100 | 60 | scalar_search<is_ascii, str_const, pos_const>( | 101 | 60 | col_left->get_data_at(0), col_right, col_pos->get_data(), vec_res, | 102 | 60 | input_rows_count); | 103 | 60 | }, |
Unexecuted instantiation: _ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb1EESG_SG_EEDaSA_SB_SC_ |
104 | 250 | make_bool_variant(is_ascii), make_bool_variant(col_const[1]), |
105 | 250 | make_bool_variant(col_const[2])); |
106 | | |
107 | 400 | } else { |
108 | 400 | std::visit( |
109 | 401 | [&](auto is_ascii, auto str_const, auto pos_const) { |
110 | 401 | vector_search<is_ascii, str_const, pos_const>(col_left, col_right, |
111 | 401 | col_pos->get_data(), vec_res, |
112 | 401 | input_rows_count); |
113 | 401 | }, _ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E0_clISt17integral_constantIbLb0EESG_SG_EEDaSA_SB_SC_ Line | Count | Source | 109 | 39 | [&](auto is_ascii, auto str_const, auto pos_const) { | 110 | 39 | vector_search<is_ascii, str_const, pos_const>(col_left, col_right, | 111 | 39 | col_pos->get_data(), vec_res, | 112 | 39 | input_rows_count); | 113 | 39 | }, |
_ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E0_clISt17integral_constantIbLb0EESG_SF_IbLb1EEEEDaSA_SB_SC_ Line | Count | Source | 109 | 22 | [&](auto is_ascii, auto str_const, auto pos_const) { | 110 | 22 | vector_search<is_ascii, str_const, pos_const>(col_left, col_right, | 111 | 22 | col_pos->get_data(), vec_res, | 112 | 22 | input_rows_count); | 113 | 22 | }, |
_ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E0_clISt17integral_constantIbLb0EESF_IbLb1EESG_EEDaSA_SB_SC_ Line | Count | Source | 109 | 22 | [&](auto is_ascii, auto str_const, auto pos_const) { | 110 | 22 | vector_search<is_ascii, str_const, pos_const>(col_left, col_right, | 111 | 22 | col_pos->get_data(), vec_res, | 112 | 22 | input_rows_count); | 113 | 22 | }, |
_ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E0_clISt17integral_constantIbLb0EESF_IbLb1EESH_EEDaSA_SB_SC_ Line | Count | Source | 109 | 22 | [&](auto is_ascii, auto str_const, auto pos_const) { | 110 | 22 | vector_search<is_ascii, str_const, pos_const>(col_left, col_right, | 111 | 22 | col_pos->get_data(), vec_res, | 112 | 22 | input_rows_count); | 113 | 22 | }, |
_ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E0_clISt17integral_constantIbLb1EESF_IbLb0EESH_EEDaSA_SB_SC_ Line | Count | Source | 109 | 116 | [&](auto is_ascii, auto str_const, auto pos_const) { | 110 | 116 | vector_search<is_ascii, str_const, pos_const>(col_left, col_right, | 111 | 116 | col_pos->get_data(), vec_res, | 112 | 116 | input_rows_count); | 113 | 116 | }, |
_ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E0_clISt17integral_constantIbLb1EESF_IbLb0EESG_EEDaSA_SB_SC_ Line | Count | Source | 109 | 60 | [&](auto is_ascii, auto str_const, auto pos_const) { | 110 | 60 | vector_search<is_ascii, str_const, pos_const>(col_left, col_right, | 111 | 60 | col_pos->get_data(), vec_res, | 112 | 60 | input_rows_count); | 113 | 60 | }, |
_ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E0_clISt17integral_constantIbLb1EESG_SF_IbLb0EEEEDaSA_SB_SC_ Line | Count | Source | 109 | 60 | [&](auto is_ascii, auto str_const, auto pos_const) { | 110 | 60 | vector_search<is_ascii, str_const, pos_const>(col_left, col_right, | 111 | 60 | col_pos->get_data(), vec_res, | 112 | 60 | input_rows_count); | 113 | 60 | }, |
_ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E0_clISt17integral_constantIbLb1EESG_SG_EEDaSA_SB_SC_ Line | Count | Source | 109 | 60 | [&](auto is_ascii, auto str_const, auto pos_const) { | 110 | 60 | vector_search<is_ascii, str_const, pos_const>(col_left, col_right, | 111 | 60 | col_pos->get_data(), vec_res, | 112 | 60 | input_rows_count); | 113 | 60 | }, |
|
114 | 400 | make_bool_variant(is_ascii), make_bool_variant(col_const[1]), |
115 | 400 | make_bool_variant(col_const[2])); |
116 | 400 | } |
117 | 650 | block.replace_by_position(result, std::move(col_res)); |
118 | 650 | return Status::OK(); |
119 | 650 | } |
120 | | |
121 | | private: |
122 | | template <bool is_ascii, bool str_const, bool pos_const> |
123 | | void scalar_search(const StringRef& ldata, const ColumnString* col_right, |
124 | | const PaddedPODArray<Int32>& posdata, PaddedPODArray<Int32>& res, |
125 | 250 | size_t size) const { |
126 | 250 | res.resize(size); |
127 | 250 | StringRef substr(ldata.data, ldata.size); |
128 | 250 | StringSearch search {&substr}; |
129 | | |
130 | 521 | for (int i = 0; i < size; ++i) { |
131 | 271 | res[i] = locate_pos<is_ascii>(substr, |
132 | 271 | col_right->get_data_at(index_check_const<str_const>(i)), |
133 | 271 | search, posdata[index_check_const<pos_const>(i)]); |
134 | 271 | } |
135 | 250 | } _ZNK5doris23FunctionStringLocatePos13scalar_searchILb0ELb0ELb0EEEvRKNS_9StringRefEPKNS_9ColumnStrIjEERKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSD_m Line | Count | Source | 125 | 22 | size_t size) const { | 126 | 22 | res.resize(size); | 127 | 22 | StringRef substr(ldata.data, ldata.size); | 128 | 22 | StringSearch search {&substr}; | 129 | | | 130 | 44 | for (int i = 0; i < size; ++i) { | 131 | 22 | res[i] = locate_pos<is_ascii>(substr, | 132 | 22 | col_right->get_data_at(index_check_const<str_const>(i)), | 133 | 22 | search, posdata[index_check_const<pos_const>(i)]); | 134 | 22 | } | 135 | 22 | } |
_ZNK5doris23FunctionStringLocatePos13scalar_searchILb0ELb0ELb1EEEvRKNS_9StringRefEPKNS_9ColumnStrIjEERKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSD_m Line | Count | Source | 125 | 22 | size_t size) const { | 126 | 22 | res.resize(size); | 127 | 22 | StringRef substr(ldata.data, ldata.size); | 128 | 22 | StringSearch search {&substr}; | 129 | | | 130 | 44 | for (int i = 0; i < size; ++i) { | 131 | 22 | res[i] = locate_pos<is_ascii>(substr, | 132 | 22 | col_right->get_data_at(index_check_const<str_const>(i)), | 133 | 22 | search, posdata[index_check_const<pos_const>(i)]); | 134 | 22 | } | 135 | 22 | } |
_ZNK5doris23FunctionStringLocatePos13scalar_searchILb0ELb1ELb0EEEvRKNS_9StringRefEPKNS_9ColumnStrIjEERKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSD_m Line | Count | Source | 125 | 22 | size_t size) const { | 126 | 22 | res.resize(size); | 127 | 22 | StringRef substr(ldata.data, ldata.size); | 128 | 22 | StringSearch search {&substr}; | 129 | | | 130 | 44 | for (int i = 0; i < size; ++i) { | 131 | 22 | res[i] = locate_pos<is_ascii>(substr, | 132 | 22 | col_right->get_data_at(index_check_const<str_const>(i)), | 133 | 22 | search, posdata[index_check_const<pos_const>(i)]); | 134 | 22 | } | 135 | 22 | } |
Unexecuted instantiation: _ZNK5doris23FunctionStringLocatePos13scalar_searchILb0ELb1ELb1EEEvRKNS_9StringRefEPKNS_9ColumnStrIjEERKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSD_m _ZNK5doris23FunctionStringLocatePos13scalar_searchILb1ELb0ELb0EEEvRKNS_9StringRefEPKNS_9ColumnStrIjEERKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSD_m Line | Count | Source | 125 | 60 | size_t size) const { | 126 | 60 | res.resize(size); | 127 | 60 | StringRef substr(ldata.data, ldata.size); | 128 | 60 | StringSearch search {&substr}; | 129 | | | 130 | 120 | for (int i = 0; i < size; ++i) { | 131 | 60 | res[i] = locate_pos<is_ascii>(substr, | 132 | 60 | col_right->get_data_at(index_check_const<str_const>(i)), | 133 | 60 | search, posdata[index_check_const<pos_const>(i)]); | 134 | 60 | } | 135 | 60 | } |
_ZNK5doris23FunctionStringLocatePos13scalar_searchILb1ELb0ELb1EEEvRKNS_9StringRefEPKNS_9ColumnStrIjEERKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSD_m Line | Count | Source | 125 | 64 | size_t size) const { | 126 | 64 | res.resize(size); | 127 | 64 | StringRef substr(ldata.data, ldata.size); | 128 | 64 | StringSearch search {&substr}; | 129 | | | 130 | 149 | for (int i = 0; i < size; ++i) { | 131 | 85 | res[i] = locate_pos<is_ascii>(substr, | 132 | 85 | col_right->get_data_at(index_check_const<str_const>(i)), | 133 | 85 | search, posdata[index_check_const<pos_const>(i)]); | 134 | 85 | } | 135 | 64 | } |
_ZNK5doris23FunctionStringLocatePos13scalar_searchILb1ELb1ELb0EEEvRKNS_9StringRefEPKNS_9ColumnStrIjEERKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSD_m Line | Count | Source | 125 | 60 | size_t size) const { | 126 | 60 | res.resize(size); | 127 | 60 | StringRef substr(ldata.data, ldata.size); | 128 | 60 | StringSearch search {&substr}; | 129 | | | 130 | 120 | for (int i = 0; i < size; ++i) { | 131 | 60 | res[i] = locate_pos<is_ascii>(substr, | 132 | 60 | col_right->get_data_at(index_check_const<str_const>(i)), | 133 | 60 | search, posdata[index_check_const<pos_const>(i)]); | 134 | 60 | } | 135 | 60 | } |
Unexecuted instantiation: _ZNK5doris23FunctionStringLocatePos13scalar_searchILb1ELb1ELb1EEEvRKNS_9StringRefEPKNS_9ColumnStrIjEERKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSD_m |
136 | | |
137 | | template <bool is_ascii, bool str_const, bool pos_const> |
138 | | void vector_search(const ColumnString* col_left, const ColumnString* col_right, |
139 | | const PaddedPODArray<Int32>& posdata, PaddedPODArray<Int32>& res, |
140 | 401 | size_t size) const { |
141 | 401 | res.resize(size); |
142 | 401 | StringSearch search; |
143 | 984 | for (int i = 0; i < size; ++i) { |
144 | 583 | StringRef substr = col_left->get_data_at(i); |
145 | 583 | search.set_pattern(&substr); |
146 | 583 | res[i] = locate_pos<is_ascii>(substr, |
147 | 583 | col_right->get_data_at(index_check_const<str_const>(i)), |
148 | 583 | search, posdata[index_check_const<pos_const>(i)]); |
149 | 583 | } |
150 | 401 | } _ZNK5doris23FunctionStringLocatePos13vector_searchILb0ELb0ELb0EEEvPKNS_9ColumnStrIjEES5_RKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSA_m Line | Count | Source | 140 | 39 | size_t size) const { | 141 | 39 | res.resize(size); | 142 | 39 | StringSearch search; | 143 | 103 | for (int i = 0; i < size; ++i) { | 144 | 64 | StringRef substr = col_left->get_data_at(i); | 145 | 64 | search.set_pattern(&substr); | 146 | 64 | res[i] = locate_pos<is_ascii>(substr, | 147 | 64 | col_right->get_data_at(index_check_const<str_const>(i)), | 148 | 64 | search, posdata[index_check_const<pos_const>(i)]); | 149 | 64 | } | 150 | 39 | } |
_ZNK5doris23FunctionStringLocatePos13vector_searchILb0ELb0ELb1EEEvPKNS_9ColumnStrIjEES5_RKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSA_m Line | Count | Source | 140 | 22 | size_t size) const { | 141 | 22 | res.resize(size); | 142 | 22 | StringSearch search; | 143 | 44 | for (int i = 0; i < size; ++i) { | 144 | 22 | StringRef substr = col_left->get_data_at(i); | 145 | 22 | search.set_pattern(&substr); | 146 | 22 | res[i] = locate_pos<is_ascii>(substr, | 147 | 22 | col_right->get_data_at(index_check_const<str_const>(i)), | 148 | 22 | search, posdata[index_check_const<pos_const>(i)]); | 149 | 22 | } | 150 | 22 | } |
_ZNK5doris23FunctionStringLocatePos13vector_searchILb0ELb1ELb0EEEvPKNS_9ColumnStrIjEES5_RKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSA_m Line | Count | Source | 140 | 22 | size_t size) const { | 141 | 22 | res.resize(size); | 142 | 22 | StringSearch search; | 143 | 44 | for (int i = 0; i < size; ++i) { | 144 | 22 | StringRef substr = col_left->get_data_at(i); | 145 | 22 | search.set_pattern(&substr); | 146 | 22 | res[i] = locate_pos<is_ascii>(substr, | 147 | 22 | col_right->get_data_at(index_check_const<str_const>(i)), | 148 | 22 | search, posdata[index_check_const<pos_const>(i)]); | 149 | 22 | } | 150 | 22 | } |
_ZNK5doris23FunctionStringLocatePos13vector_searchILb0ELb1ELb1EEEvPKNS_9ColumnStrIjEES5_RKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSA_m Line | Count | Source | 140 | 22 | size_t size) const { | 141 | 22 | res.resize(size); | 142 | 22 | StringSearch search; | 143 | 44 | for (int i = 0; i < size; ++i) { | 144 | 22 | StringRef substr = col_left->get_data_at(i); | 145 | 22 | search.set_pattern(&substr); | 146 | 22 | res[i] = locate_pos<is_ascii>(substr, | 147 | 22 | col_right->get_data_at(index_check_const<str_const>(i)), | 148 | 22 | search, posdata[index_check_const<pos_const>(i)]); | 149 | 22 | } | 150 | 22 | } |
_ZNK5doris23FunctionStringLocatePos13vector_searchILb1ELb0ELb0EEEvPKNS_9ColumnStrIjEES5_RKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSA_m Line | Count | Source | 140 | 116 | size_t size) const { | 141 | 116 | res.resize(size); | 142 | 116 | StringSearch search; | 143 | 389 | for (int i = 0; i < size; ++i) { | 144 | 273 | StringRef substr = col_left->get_data_at(i); | 145 | 273 | search.set_pattern(&substr); | 146 | 273 | res[i] = locate_pos<is_ascii>(substr, | 147 | 273 | col_right->get_data_at(index_check_const<str_const>(i)), | 148 | 273 | search, posdata[index_check_const<pos_const>(i)]); | 149 | 273 | } | 150 | 116 | } |
_ZNK5doris23FunctionStringLocatePos13vector_searchILb1ELb0ELb1EEEvPKNS_9ColumnStrIjEES5_RKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSA_m Line | Count | Source | 140 | 60 | size_t size) const { | 141 | 60 | res.resize(size); | 142 | 60 | StringSearch search; | 143 | 120 | for (int i = 0; i < size; ++i) { | 144 | 60 | StringRef substr = col_left->get_data_at(i); | 145 | 60 | search.set_pattern(&substr); | 146 | 60 | res[i] = locate_pos<is_ascii>(substr, | 147 | 60 | col_right->get_data_at(index_check_const<str_const>(i)), | 148 | 60 | search, posdata[index_check_const<pos_const>(i)]); | 149 | 60 | } | 150 | 60 | } |
_ZNK5doris23FunctionStringLocatePos13vector_searchILb1ELb1ELb0EEEvPKNS_9ColumnStrIjEES5_RKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSA_m Line | Count | Source | 140 | 60 | size_t size) const { | 141 | 60 | res.resize(size); | 142 | 60 | StringSearch search; | 143 | 120 | for (int i = 0; i < size; ++i) { | 144 | 60 | StringRef substr = col_left->get_data_at(i); | 145 | 60 | search.set_pattern(&substr); | 146 | 60 | res[i] = locate_pos<is_ascii>(substr, | 147 | 60 | col_right->get_data_at(index_check_const<str_const>(i)), | 148 | 60 | search, posdata[index_check_const<pos_const>(i)]); | 149 | 60 | } | 150 | 60 | } |
_ZNK5doris23FunctionStringLocatePos13vector_searchILb1ELb1ELb1EEEvPKNS_9ColumnStrIjEES5_RKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSA_m Line | Count | Source | 140 | 60 | size_t size) const { | 141 | 60 | res.resize(size); | 142 | 60 | StringSearch search; | 143 | 120 | for (int i = 0; i < size; ++i) { | 144 | 60 | StringRef substr = col_left->get_data_at(i); | 145 | 60 | search.set_pattern(&substr); | 146 | 60 | res[i] = locate_pos<is_ascii>(substr, | 147 | 60 | col_right->get_data_at(index_check_const<str_const>(i)), | 148 | 60 | search, posdata[index_check_const<pos_const>(i)]); | 149 | 60 | } | 150 | 60 | } |
|
151 | | |
152 | | template <bool is_ascii> |
153 | 854 | int locate_pos(StringRef substr, StringRef str, StringSearch& search, int start_pos) const { |
154 | 854 | if (str.size == 0 && substr.size == 0 && start_pos == 1) { |
155 | | // BEHAVIOR COMPATIBLE WITH MYSQL |
156 | | // locate('','') locate('','',1) locate('','',2) |
157 | | // 1 1 0 |
158 | 13 | return 1; |
159 | 13 | } |
160 | 841 | if (is_ascii) { |
161 | 645 | return locate_pos_ascii(substr, str, search, start_pos); |
162 | 645 | } else { |
163 | 196 | return locate_pos_utf8(substr, str, search, start_pos); |
164 | 196 | } |
165 | 841 | } _ZNK5doris23FunctionStringLocatePos10locate_posILb0EEEiNS_9StringRefES2_RNS_12StringSearchEi Line | Count | Source | 153 | 196 | int locate_pos(StringRef substr, StringRef str, StringSearch& search, int start_pos) const { | 154 | 196 | if (str.size == 0 && substr.size == 0 && start_pos == 1) { | 155 | | // BEHAVIOR COMPATIBLE WITH MYSQL | 156 | | // locate('','') locate('','',1) locate('','',2) | 157 | | // 1 1 0 | 158 | 0 | return 1; | 159 | 0 | } | 160 | 196 | if (is_ascii) { | 161 | 0 | return locate_pos_ascii(substr, str, search, start_pos); | 162 | 196 | } else { | 163 | 196 | return locate_pos_utf8(substr, str, search, start_pos); | 164 | 196 | } | 165 | 196 | } |
_ZNK5doris23FunctionStringLocatePos10locate_posILb1EEEiNS_9StringRefES2_RNS_12StringSearchEi Line | Count | Source | 153 | 658 | int locate_pos(StringRef substr, StringRef str, StringSearch& search, int start_pos) const { | 154 | 658 | if (str.size == 0 && substr.size == 0 && start_pos == 1) { | 155 | | // BEHAVIOR COMPATIBLE WITH MYSQL | 156 | | // locate('','') locate('','',1) locate('','',2) | 157 | | // 1 1 0 | 158 | 13 | return 1; | 159 | 13 | } | 160 | 645 | if (is_ascii) { | 161 | 645 | return locate_pos_ascii(substr, str, search, start_pos); | 162 | 645 | } else { | 163 | 0 | return locate_pos_utf8(substr, str, search, start_pos); | 164 | 0 | } | 165 | 645 | } |
|
166 | | |
167 | | int locate_pos_utf8(StringRef substr, StringRef str, StringSearch& search, |
168 | 196 | int start_pos) const { |
169 | 196 | std::vector<size_t> index; |
170 | 196 | size_t char_len = simd::VStringFunctions::get_char_len(str.data, str.size, index); |
171 | 196 | if (start_pos <= 0 || start_pos > char_len) { |
172 | 49 | return 0; |
173 | 49 | } |
174 | 147 | if (substr.size == 0) { |
175 | 18 | return start_pos; |
176 | 18 | } |
177 | | // Input start_pos starts from 1. |
178 | 129 | StringRef adjusted_str(str.data + index[start_pos - 1], str.size - index[start_pos - 1]); |
179 | 129 | int32_t match_pos = search.search(&adjusted_str); |
180 | 129 | if (match_pos >= 0) { |
181 | | // Hive returns the position in the original string starting from 1. |
182 | 111 | return start_pos + simd::VStringFunctions::get_char_len(adjusted_str.data, match_pos); |
183 | 111 | } else { |
184 | 18 | return 0; |
185 | 18 | } |
186 | 129 | } |
187 | | |
188 | | int locate_pos_ascii(StringRef substr, StringRef str, StringSearch& search, |
189 | 645 | int start_pos) const { |
190 | 645 | if (start_pos <= 0 || start_pos > str.size) { |
191 | 412 | return 0; |
192 | 412 | } |
193 | 233 | if (substr.size == 0) { |
194 | 38 | return start_pos; |
195 | 38 | } |
196 | | // Input start_pos starts from 1. |
197 | 195 | StringRef adjusted_str(str.data + start_pos - 1, str.size - start_pos + 1); |
198 | 195 | int32_t match_pos = search.search(&adjusted_str); |
199 | 195 | if (match_pos >= 0) { |
200 | | // Hive returns the position in the original string starting from 1. |
201 | 62 | return start_pos + match_pos; |
202 | 133 | } else { |
203 | 133 | return 0; |
204 | 133 | } |
205 | 195 | } |
206 | | }; |
207 | | |
208 | | class FunctionSplitPart : public IFunction { |
209 | | public: |
210 | | static constexpr auto name = "split_part"; |
211 | 145 | static FunctionPtr create() { return std::make_shared<FunctionSplitPart>(); } |
212 | 1 | String get_name() const override { return name; } |
213 | 136 | size_t get_number_of_arguments() const override { return 3; } |
214 | | |
215 | 136 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
216 | 136 | return make_nullable(std::make_shared<DataTypeString>()); |
217 | 136 | } |
218 | | |
219 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
220 | 151 | uint32_t result, size_t input_rows_count) const override { |
221 | 151 | DCHECK_EQ(arguments.size(), 3); |
222 | | |
223 | 151 | auto null_map = ColumnUInt8::create(input_rows_count, 0); |
224 | | // Create a zero column to simply implement |
225 | 151 | auto const_null_map = ColumnUInt8::create(input_rows_count, 0); |
226 | 151 | auto res = ColumnString::create(); |
227 | | |
228 | 151 | auto& null_map_data = null_map->get_data(); |
229 | 151 | auto& res_offsets = res->get_offsets(); |
230 | 151 | auto& res_chars = res->get_chars(); |
231 | 151 | res_offsets.resize(input_rows_count); |
232 | | |
233 | 151 | const size_t argument_size = arguments.size(); |
234 | 151 | std::vector<ColumnPtr> argument_columns(argument_size); |
235 | 604 | for (size_t i = 0; i < argument_size; ++i) { |
236 | 453 | argument_columns[i] = |
237 | 453 | block.get_by_position(arguments[i]).column->convert_to_full_column_if_const(); |
238 | 453 | if (const auto* nullable = |
239 | 453 | check_and_get_column<const ColumnNullable>(*argument_columns[i])) { |
240 | | // Danger: Here must dispose the null map data first! Because |
241 | | // argument_columns[i]=nullable->get_nested_column_ptr(); will release the mem |
242 | | // of column nullable mem of null map |
243 | 0 | VectorizedUtils::update_null_map(null_map->get_data(), |
244 | 0 | nullable->get_null_map_data()); |
245 | 0 | argument_columns[i] = nullable->get_nested_column_ptr(); |
246 | 0 | } |
247 | 453 | } |
248 | | |
249 | 151 | const auto* str_col = assert_cast<const ColumnString*>(argument_columns[0].get()); |
250 | | |
251 | 151 | const auto* delimiter_col = assert_cast<const ColumnString*>(argument_columns[1].get()); |
252 | | |
253 | 151 | const auto* part_num_col = assert_cast<const ColumnInt32*>(argument_columns[2].get()); |
254 | 151 | const auto& part_num_col_data = part_num_col->get_data(); |
255 | | |
256 | 396 | for (size_t i = 0; i < input_rows_count; ++i) { |
257 | 245 | if (part_num_col_data[i] == 0) { |
258 | 11 | StringOP::push_null_string(i, res_chars, res_offsets, null_map_data); |
259 | 11 | continue; |
260 | 11 | } |
261 | | |
262 | 234 | auto delimiter = delimiter_col->get_data_at(i); |
263 | 234 | auto delimiter_str = delimiter_col->get_data_at(i).to_string(); |
264 | 234 | auto part_number = part_num_col_data[i]; |
265 | 234 | auto str = str_col->get_data_at(i); |
266 | 234 | if (delimiter.size == 0) { |
267 | 9 | StringOP::push_empty_string(i, res_chars, res_offsets); |
268 | 9 | continue; |
269 | 9 | } |
270 | | |
271 | 225 | if (part_number > 0) { |
272 | 186 | if (delimiter.size == 1) { |
273 | | // If delimiter is a char, use memchr to split |
274 | 153 | int32_t pre_offset = -1; |
275 | 153 | int32_t offset = -1; |
276 | 153 | int32_t num = 0; |
277 | 258 | while (num < part_number) { |
278 | 215 | pre_offset = offset; |
279 | 215 | size_t n = str.size - offset - 1; |
280 | 215 | const char* pos = reinterpret_cast<const char*>( |
281 | 215 | memchr(str.data + offset + 1, delimiter_str[0], n)); |
282 | 215 | if (pos != nullptr) { |
283 | 105 | offset = pos - str.data; |
284 | 105 | num++; |
285 | 110 | } else { |
286 | 110 | offset = str.size; |
287 | 110 | num = (num == 0) ? 0 : num + 1; |
288 | 110 | break; |
289 | 110 | } |
290 | 215 | } |
291 | | |
292 | 153 | if (num == part_number) { |
293 | 70 | StringOP::push_value_string( |
294 | 70 | std::string_view { |
295 | 70 | reinterpret_cast<const char*>(str.data + pre_offset + 1), |
296 | 70 | (size_t)offset - pre_offset - 1}, |
297 | 70 | i, res_chars, res_offsets); |
298 | 83 | } else { |
299 | 83 | StringOP::push_null_string(i, res_chars, res_offsets, null_map_data); |
300 | 83 | } |
301 | 153 | } else { |
302 | | // If delimiter is a string, use memmem to split |
303 | 33 | int32_t pre_offset = -delimiter.size; |
304 | 33 | int32_t offset = -delimiter.size; |
305 | 33 | int32_t num = 0; |
306 | 68 | while (num < part_number) { |
307 | 54 | pre_offset = offset; |
308 | 54 | size_t n = str.size - offset - delimiter.size; |
309 | 54 | char* pos = |
310 | 54 | reinterpret_cast<char*>(memmem(str.data + offset + delimiter.size, |
311 | 54 | n, delimiter.data, delimiter.size)); |
312 | 54 | if (pos != nullptr) { |
313 | 35 | offset = pos - str.data; |
314 | 35 | num++; |
315 | 35 | } else { |
316 | 19 | offset = str.size; |
317 | 19 | num = (num == 0) ? 0 : num + 1; |
318 | 19 | break; |
319 | 19 | } |
320 | 54 | } |
321 | | |
322 | 33 | if (num == part_number) { |
323 | 27 | StringOP::push_value_string( |
324 | 27 | std::string_view {reinterpret_cast<const char*>( |
325 | 27 | str.data + pre_offset + delimiter.size), |
326 | 27 | (size_t)offset - pre_offset - delimiter.size}, |
327 | 27 | i, res_chars, res_offsets); |
328 | 27 | } else { |
329 | 6 | StringOP::push_null_string(i, res_chars, res_offsets, null_map_data); |
330 | 6 | } |
331 | 33 | } |
332 | 186 | } else { |
333 | 39 | part_number = -part_number; |
334 | 39 | auto str_str = str.to_string(); |
335 | 39 | int32_t offset = str.size; |
336 | 39 | int32_t pre_offset = offset; |
337 | 39 | int32_t num = 0; |
338 | 39 | auto substr = str_str; |
339 | 83 | while (num <= part_number && offset >= 0) { |
340 | 83 | offset = (int)substr.rfind(delimiter, offset); |
341 | 83 | if (offset != -1) { |
342 | 62 | if (++num == part_number) { |
343 | 18 | break; |
344 | 18 | } |
345 | 44 | pre_offset = offset; |
346 | 44 | offset = offset - 1; |
347 | 44 | substr = str_str.substr(0, pre_offset); |
348 | 44 | } else { |
349 | 21 | break; |
350 | 21 | } |
351 | 83 | } |
352 | 39 | num = (offset == -1 && num != 0) ? num + 1 : num; |
353 | | |
354 | 39 | if (num == part_number) { |
355 | 24 | if (offset == -1) { |
356 | 6 | StringOP::push_value_string(std::string_view {str.data, (size_t)pre_offset}, |
357 | 6 | i, res_chars, res_offsets); |
358 | 18 | } else { |
359 | 18 | StringOP::push_value_string( |
360 | 18 | std::string_view {str_str.substr( |
361 | 18 | offset + delimiter.size, |
362 | 18 | (size_t)pre_offset - offset - delimiter.size)}, |
363 | 18 | i, res_chars, res_offsets); |
364 | 18 | } |
365 | 24 | } else { |
366 | 15 | StringOP::push_null_string(i, res_chars, res_offsets, null_map_data); |
367 | 15 | } |
368 | 39 | } |
369 | 225 | } |
370 | | |
371 | 151 | block.get_by_position(result).column = |
372 | 151 | ColumnNullable::create(std::move(res), std::move(null_map)); |
373 | 151 | return Status::OK(); |
374 | 151 | } |
375 | | }; |
376 | | |
377 | | class FunctionSubstringIndex : public IFunction { |
378 | | public: |
379 | | static constexpr auto name = "substring_index"; |
380 | 112 | static FunctionPtr create() { return std::make_shared<FunctionSubstringIndex>(); } |
381 | 1 | String get_name() const override { return name; } |
382 | 103 | size_t get_number_of_arguments() const override { return 3; } |
383 | | |
384 | 103 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
385 | 103 | return std::make_shared<DataTypeString>(); |
386 | 103 | } |
387 | | |
388 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
389 | 107 | uint32_t result, size_t input_rows_count) const override { |
390 | 107 | DCHECK_EQ(arguments.size(), 3); |
391 | | |
392 | | // Create a zero column to simply implement |
393 | 107 | auto res = ColumnString::create(); |
394 | | |
395 | 107 | auto& res_offsets = res->get_offsets(); |
396 | 107 | auto& res_chars = res->get_chars(); |
397 | 107 | res_offsets.resize(input_rows_count); |
398 | 107 | ColumnPtr content_column; |
399 | 107 | bool content_const = false; |
400 | 107 | std::tie(content_column, content_const) = |
401 | 107 | unpack_if_const(block.get_by_position(arguments[0]).column); |
402 | | |
403 | 107 | const auto* str_col = assert_cast<const ColumnString*>(content_column.get()); |
404 | | |
405 | | // Handle both constant and non-constant delimiter parameters |
406 | 107 | ColumnPtr delimiter_column_ptr; |
407 | 107 | bool delimiter_const = false; |
408 | 107 | std::tie(delimiter_column_ptr, delimiter_const) = |
409 | 107 | unpack_if_const(block.get_by_position(arguments[1]).column); |
410 | 107 | const auto* delimiter_col = assert_cast<const ColumnString*>(delimiter_column_ptr.get()); |
411 | | |
412 | 107 | ColumnPtr part_num_column_ptr; |
413 | 107 | bool part_num_const = false; |
414 | 107 | std::tie(part_num_column_ptr, part_num_const) = |
415 | 107 | unpack_if_const(block.get_by_position(arguments[2]).column); |
416 | 107 | const ColumnInt32* part_num_col = |
417 | 107 | assert_cast<const ColumnInt32*>(part_num_column_ptr.get()); |
418 | | |
419 | | // For constant multi-character delimiters, create StringRef and StringSearch only once |
420 | 107 | std::optional<StringRef> const_delimiter_ref; |
421 | 107 | std::optional<StringSearch> const_search; |
422 | 107 | if (delimiter_const && delimiter_col->get_data_at(0).size > 1) { |
423 | 0 | const_delimiter_ref.emplace(delimiter_col->get_data_at(0)); |
424 | 0 | const_search.emplace(&const_delimiter_ref.value()); |
425 | 0 | } |
426 | | |
427 | 297 | for (size_t i = 0; i < input_rows_count; ++i) { |
428 | 190 | auto str = str_col->get_data_at(content_const ? 0 : i); |
429 | 190 | auto delimiter = delimiter_col->get_data_at(delimiter_const ? 0 : i); |
430 | 190 | int32_t delimiter_size = delimiter.size; |
431 | | |
432 | 190 | auto part_number = part_num_col->get_element(part_num_const ? 0 : i); |
433 | | |
434 | 190 | if (part_number == 0 || delimiter_size == 0) { |
435 | 7 | StringOP::push_empty_string(i, res_chars, res_offsets); |
436 | 7 | continue; |
437 | 7 | } |
438 | | |
439 | 183 | if (part_number > 0) { |
440 | 128 | if (delimiter_size == 1) { |
441 | 85 | int32_t offset = -1; |
442 | 85 | int32_t num = 0; |
443 | 137 | while (num < part_number) { |
444 | 117 | size_t n = str.size - offset - 1; |
445 | 117 | const char* pos = reinterpret_cast<const char*>( |
446 | 117 | memchr(str.data + offset + 1, delimiter.data[0], n)); |
447 | 117 | if (pos != nullptr) { |
448 | 52 | offset = pos - str.data; |
449 | 52 | num++; |
450 | 65 | } else { |
451 | 65 | offset = str.size; |
452 | 65 | num = (num == 0) ? 0 : num + 1; |
453 | 65 | break; |
454 | 65 | } |
455 | 117 | } |
456 | | |
457 | 85 | if (num == part_number) { |
458 | 25 | StringOP::push_value_string(std::string_view {str.data, (size_t)offset}, i, |
459 | 25 | res_chars, res_offsets); |
460 | 60 | } else { |
461 | 60 | StringOP::push_value_string(std::string_view(str.data, str.size), i, |
462 | 60 | res_chars, res_offsets); |
463 | 60 | } |
464 | 85 | } else { |
465 | | // For multi-character delimiters |
466 | | // Use pre-created StringRef and StringSearch for constant delimiters |
467 | 43 | StringRef delimiter_ref = const_delimiter_ref ? const_delimiter_ref.value() |
468 | 43 | : StringRef(delimiter); |
469 | 43 | const StringSearch* search_ptr = const_search ? &const_search.value() : nullptr; |
470 | 43 | StringSearch local_search(&delimiter_ref); |
471 | 43 | if (!search_ptr) { |
472 | 43 | search_ptr = &local_search; |
473 | 43 | } |
474 | | |
475 | 43 | int32_t offset = -delimiter_size; |
476 | 43 | int32_t num = 0; |
477 | 86 | while (num < part_number) { |
478 | 59 | size_t n = str.size - offset - delimiter_size; |
479 | | // search first match delimter_ref index from src string among str_offset to end |
480 | 59 | const char* pos = search_ptr->search(str.data + offset + delimiter_size, n); |
481 | 59 | if (pos < str.data + str.size) { |
482 | 43 | offset = pos - str.data; |
483 | 43 | num++; |
484 | 43 | } else { |
485 | 16 | offset = str.size; |
486 | 16 | num = (num == 0) ? 0 : num + 1; |
487 | 16 | break; |
488 | 16 | } |
489 | 59 | } |
490 | | |
491 | 43 | if (num == part_number) { |
492 | 40 | StringOP::push_value_string(std::string_view {str.data, (size_t)offset}, i, |
493 | 40 | res_chars, res_offsets); |
494 | 40 | } else { |
495 | 3 | StringOP::push_value_string(std::string_view(str.data, str.size), i, |
496 | 3 | res_chars, res_offsets); |
497 | 3 | } |
498 | 43 | } |
499 | 128 | } else { |
500 | 55 | int neg_part_number = -part_number; |
501 | 55 | auto str_str = str.to_string(); |
502 | 55 | int32_t offset = str.size; |
503 | 55 | int32_t pre_offset = offset; |
504 | 55 | int32_t num = 0; |
505 | 55 | auto substr = str_str; |
506 | | |
507 | | // Use pre-created StringRef for constant delimiters |
508 | 55 | StringRef delimiter_str = const_delimiter_ref |
509 | 55 | ? const_delimiter_ref.value() |
510 | 55 | : StringRef(delimiter.data, delimiter.size); |
511 | | |
512 | 79 | while (num <= neg_part_number && offset >= 0) { |
513 | 79 | offset = (int)substr.rfind(delimiter_str, offset); |
514 | 79 | if (offset != -1) { |
515 | 63 | if (++num == neg_part_number) { |
516 | 39 | break; |
517 | 39 | } |
518 | 24 | pre_offset = offset; |
519 | 24 | offset = offset - 1; |
520 | 24 | substr = str_str.substr(0, pre_offset); |
521 | 24 | } else { |
522 | 16 | break; |
523 | 16 | } |
524 | 79 | } |
525 | 55 | num = (offset == -1 && num != 0) ? num + 1 : num; |
526 | | |
527 | 55 | if (num == neg_part_number) { |
528 | 43 | if (offset == -1) { |
529 | 4 | StringOP::push_value_string(std::string_view(str.data, str.size), i, |
530 | 4 | res_chars, res_offsets); |
531 | 39 | } else { |
532 | 39 | StringOP::push_value_string( |
533 | 39 | std::string_view {str.data + offset + delimiter_size, |
534 | 39 | str.size - offset - delimiter_size}, |
535 | 39 | i, res_chars, res_offsets); |
536 | 39 | } |
537 | 43 | } else { |
538 | 12 | StringOP::push_value_string(std::string_view(str.data, str.size), i, res_chars, |
539 | 12 | res_offsets); |
540 | 12 | } |
541 | 55 | } |
542 | 183 | } |
543 | | |
544 | 107 | block.get_by_position(result).column = std::move(res); |
545 | 107 | return Status::OK(); |
546 | 107 | } |
547 | | }; |
548 | | |
549 | | class FunctionSplitByString : public IFunction { |
550 | | public: |
551 | | static constexpr auto name = "split_by_string"; |
552 | | |
553 | 135 | static FunctionPtr create() { return std::make_shared<FunctionSplitByString>(); } |
554 | | using NullMapType = PaddedPODArray<UInt8>; |
555 | | |
556 | 1 | String get_name() const override { return name; } |
557 | | |
558 | 127 | bool is_variadic() const override { return false; } |
559 | | |
560 | 126 | size_t get_number_of_arguments() const override { return 2; } |
561 | | |
562 | 126 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
563 | 126 | DCHECK(is_string_type(arguments[0]->get_primitive_type())) |
564 | 0 | << "first argument for function: " << name << " should be string" |
565 | 0 | << " and arguments[0] is " << arguments[0]->get_name(); |
566 | 126 | DCHECK(is_string_type(arguments[1]->get_primitive_type())) |
567 | 0 | << "second argument for function: " << name << " should be string" |
568 | 0 | << " and arguments[1] is " << arguments[1]->get_name(); |
569 | 126 | return std::make_shared<DataTypeArray>(make_nullable(arguments[0])); |
570 | 126 | } |
571 | | |
572 | | Status execute_impl(FunctionContext* /*context*/, Block& block, const ColumnNumbers& arguments, |
573 | 166 | uint32_t result, size_t input_rows_count) const override { |
574 | 166 | DCHECK_EQ(arguments.size(), 2); |
575 | | |
576 | 166 | const auto& [src_column, left_const] = |
577 | 166 | unpack_if_const(block.get_by_position(arguments[0]).column); |
578 | 166 | const auto& [right_column, right_const] = |
579 | 166 | unpack_if_const(block.get_by_position(arguments[1]).column); |
580 | | |
581 | 166 | DataTypePtr right_column_type = block.get_by_position(arguments[1]).type; |
582 | 166 | DataTypePtr src_column_type = block.get_by_position(arguments[0]).type; |
583 | 166 | auto dest_column_ptr = ColumnArray::create(make_nullable(src_column_type)->create_column(), |
584 | 166 | ColumnArray::ColumnOffsets::create()); |
585 | | |
586 | 166 | dest_column_ptr->resize(0); |
587 | 166 | auto& dest_offsets = dest_column_ptr->get_offsets(); |
588 | | |
589 | 166 | auto& dest_nullable_col = assert_cast<ColumnNullable&>(dest_column_ptr->get_data()); |
590 | 166 | auto* dest_nested_column = dest_nullable_col.get_nested_column_ptr().get(); |
591 | | |
592 | 166 | const auto* col_str = assert_cast<const ColumnString*>(src_column.get()); |
593 | | |
594 | 166 | const auto* col_delimiter = assert_cast<const ColumnString*>(right_column.get()); |
595 | | |
596 | 166 | std::visit( |
597 | 166 | [&](auto src_const, auto delimiter_const) { |
598 | 166 | _execute<src_const, delimiter_const>(*col_str, *col_delimiter, |
599 | 166 | *dest_nested_column, dest_offsets, |
600 | 166 | input_rows_count); |
601 | 166 | }, _ZZNK5doris21FunctionSplitByString12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_E_clISt17integral_constantIbLb0EESF_EEDaSA_SB_ Line | Count | Source | 597 | 55 | [&](auto src_const, auto delimiter_const) { | 598 | 55 | _execute<src_const, delimiter_const>(*col_str, *col_delimiter, | 599 | 55 | *dest_nested_column, dest_offsets, | 600 | 55 | input_rows_count); | 601 | 55 | }, |
_ZZNK5doris21FunctionSplitByString12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_E_clISt17integral_constantIbLb0EESE_IbLb1EEEEDaSA_SB_ Line | Count | Source | 597 | 103 | [&](auto src_const, auto delimiter_const) { | 598 | 103 | _execute<src_const, delimiter_const>(*col_str, *col_delimiter, | 599 | 103 | *dest_nested_column, dest_offsets, | 600 | 103 | input_rows_count); | 601 | 103 | }, |
_ZZNK5doris21FunctionSplitByString12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_E_clISt17integral_constantIbLb1EESE_IbLb0EEEEDaSA_SB_ Line | Count | Source | 597 | 8 | [&](auto src_const, auto delimiter_const) { | 598 | 8 | _execute<src_const, delimiter_const>(*col_str, *col_delimiter, | 599 | 8 | *dest_nested_column, dest_offsets, | 600 | 8 | input_rows_count); | 601 | 8 | }, |
Unexecuted instantiation: _ZZNK5doris21FunctionSplitByString12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_E_clISt17integral_constantIbLb1EESF_EEDaSA_SB_ |
602 | 166 | make_bool_variant(left_const), make_bool_variant(right_const)); |
603 | | |
604 | | // all elements in dest_nested_column are not null |
605 | 166 | dest_nullable_col.get_null_map_column().get_data().resize_fill(dest_nested_column->size(), |
606 | 166 | false); |
607 | 166 | block.replace_by_position(result, std::move(dest_column_ptr)); |
608 | | |
609 | 166 | return Status::OK(); |
610 | 166 | } |
611 | | |
612 | | private: |
613 | | template <bool src_const, bool delimiter_const> |
614 | | void _execute(const ColumnString& src_column_string, const ColumnString& delimiter_column, |
615 | | IColumn& dest_nested_column, ColumnArray::Offsets64& dest_offsets, |
616 | 166 | size_t size) const { |
617 | 166 | auto& dest_column_string = assert_cast<ColumnString&>(dest_nested_column); |
618 | 166 | ColumnString::Chars& column_string_chars = dest_column_string.get_chars(); |
619 | 166 | ColumnString::Offsets& column_string_offsets = dest_column_string.get_offsets(); |
620 | 166 | column_string_chars.reserve(0); |
621 | | |
622 | 166 | ColumnArray::Offset64 string_pos = 0; |
623 | 166 | ColumnArray::Offset64 dest_pos = 0; |
624 | | |
625 | 166 | StringSearch search; |
626 | 166 | StringRef delimiter_ref_for_search; |
627 | | |
628 | 166 | if constexpr (delimiter_const) { |
629 | 103 | delimiter_ref_for_search = delimiter_column.get_data_at(0); |
630 | 103 | search.set_pattern(&delimiter_ref_for_search); |
631 | 103 | } |
632 | | |
633 | 1.25k | for (size_t i = 0; i < size; i++) { |
634 | 1.08k | const StringRef str_ref = |
635 | 1.08k | src_column_string.get_data_at(index_check_const<src_const>(i)); |
636 | 1.08k | const StringRef delimiter_ref = |
637 | 1.08k | delimiter_column.get_data_at(index_check_const<delimiter_const>(i)); |
638 | | |
639 | 1.08k | if (str_ref.size == 0) { |
640 | 164 | dest_offsets.push_back(dest_pos); |
641 | 164 | continue; |
642 | 164 | } |
643 | 920 | if (delimiter_ref.size == 0) { |
644 | 27 | split_empty_delimiter(str_ref, column_string_chars, column_string_offsets, |
645 | 27 | string_pos, dest_pos); |
646 | 893 | } else { |
647 | 893 | if constexpr (!delimiter_const) { |
648 | 51 | search.set_pattern(&delimiter_ref); |
649 | 51 | } |
650 | 51.7k | for (size_t str_pos = 0; str_pos <= str_ref.size;) { |
651 | 50.8k | const size_t str_offset = str_pos; |
652 | 50.8k | const size_t old_size = column_string_chars.size(); |
653 | | // search first match delimter_ref index from src string among str_offset to end |
654 | 50.8k | const char* result_start = |
655 | 50.8k | search.search(str_ref.data + str_offset, str_ref.size - str_offset); |
656 | | // compute split part size |
657 | 50.8k | const size_t split_part_size = result_start - str_ref.data - str_offset; |
658 | | // save dist string split part |
659 | 50.8k | if (split_part_size > 0) { |
660 | 50.2k | const size_t new_size = old_size + split_part_size; |
661 | 50.2k | column_string_chars.resize(new_size); |
662 | 50.2k | memcpy_small_allow_read_write_overflow15( |
663 | 50.2k | column_string_chars.data() + old_size, str_ref.data + str_offset, |
664 | 50.2k | split_part_size); |
665 | | // add dist string offset |
666 | 50.2k | string_pos += split_part_size; |
667 | 50.2k | } |
668 | 50.8k | column_string_offsets.push_back(string_pos); |
669 | | // array offset + 1 |
670 | 50.8k | dest_pos++; |
671 | | // add src string str_pos to next search start |
672 | 50.8k | str_pos += split_part_size + delimiter_ref.size; |
673 | 50.8k | } |
674 | 893 | } |
675 | 920 | dest_offsets.push_back(dest_pos); |
676 | 920 | } |
677 | 166 | } _ZNK5doris21FunctionSplitByString8_executeILb0ELb0EEEvRKNS_9ColumnStrIjEES5_RNS_7IColumnERNS_8PODArrayImLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm Line | Count | Source | 616 | 55 | size_t size) const { | 617 | 55 | auto& dest_column_string = assert_cast<ColumnString&>(dest_nested_column); | 618 | 55 | ColumnString::Chars& column_string_chars = dest_column_string.get_chars(); | 619 | 55 | ColumnString::Offsets& column_string_offsets = dest_column_string.get_offsets(); | 620 | 55 | column_string_chars.reserve(0); | 621 | | | 622 | 55 | ColumnArray::Offset64 string_pos = 0; | 623 | 55 | ColumnArray::Offset64 dest_pos = 0; | 624 | | | 625 | 55 | StringSearch search; | 626 | 55 | StringRef delimiter_ref_for_search; | 627 | | | 628 | | if constexpr (delimiter_const) { | 629 | | delimiter_ref_for_search = delimiter_column.get_data_at(0); | 630 | | search.set_pattern(&delimiter_ref_for_search); | 631 | | } | 632 | | | 633 | 130 | for (size_t i = 0; i < size; i++) { | 634 | 75 | const StringRef str_ref = | 635 | 75 | src_column_string.get_data_at(index_check_const<src_const>(i)); | 636 | 75 | const StringRef delimiter_ref = | 637 | 75 | delimiter_column.get_data_at(index_check_const<delimiter_const>(i)); | 638 | | | 639 | 75 | if (str_ref.size == 0) { | 640 | 13 | dest_offsets.push_back(dest_pos); | 641 | 13 | continue; | 642 | 13 | } | 643 | 62 | if (delimiter_ref.size == 0) { | 644 | 11 | split_empty_delimiter(str_ref, column_string_chars, column_string_offsets, | 645 | 11 | string_pos, dest_pos); | 646 | 51 | } else { | 647 | 51 | if constexpr (!delimiter_const) { | 648 | 51 | search.set_pattern(&delimiter_ref); | 649 | 51 | } | 650 | 214 | for (size_t str_pos = 0; str_pos <= str_ref.size;) { | 651 | 163 | const size_t str_offset = str_pos; | 652 | 163 | const size_t old_size = column_string_chars.size(); | 653 | | // search first match delimter_ref index from src string among str_offset to end | 654 | 163 | const 
char* result_start = | 655 | 163 | search.search(str_ref.data + str_offset, str_ref.size - str_offset); | 656 | | // compute split part size | 657 | 163 | const size_t split_part_size = result_start - str_ref.data - str_offset; | 658 | | // save dist string split part | 659 | 163 | if (split_part_size > 0) { | 660 | 122 | const size_t new_size = old_size + split_part_size; | 661 | 122 | column_string_chars.resize(new_size); | 662 | 122 | memcpy_small_allow_read_write_overflow15( | 663 | 122 | column_string_chars.data() + old_size, str_ref.data + str_offset, | 664 | 122 | split_part_size); | 665 | | // add dist string offset | 666 | 122 | string_pos += split_part_size; | 667 | 122 | } | 668 | 163 | column_string_offsets.push_back(string_pos); | 669 | | // array offset + 1 | 670 | 163 | dest_pos++; | 671 | | // add src string str_pos to next search start | 672 | 163 | str_pos += split_part_size + delimiter_ref.size; | 673 | 163 | } | 674 | 51 | } | 675 | 62 | dest_offsets.push_back(dest_pos); | 676 | 62 | } | 677 | 55 | } |
_ZNK5doris21FunctionSplitByString8_executeILb0ELb1EEEvRKNS_9ColumnStrIjEES5_RNS_7IColumnERNS_8PODArrayImLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm Line | Count | Source | 616 | 103 | size_t size) const { | 617 | 103 | auto& dest_column_string = assert_cast<ColumnString&>(dest_nested_column); | 618 | 103 | ColumnString::Chars& column_string_chars = dest_column_string.get_chars(); | 619 | 103 | ColumnString::Offsets& column_string_offsets = dest_column_string.get_offsets(); | 620 | 103 | column_string_chars.reserve(0); | 621 | | | 622 | 103 | ColumnArray::Offset64 string_pos = 0; | 623 | 103 | ColumnArray::Offset64 dest_pos = 0; | 624 | | | 625 | 103 | StringSearch search; | 626 | 103 | StringRef delimiter_ref_for_search; | 627 | | | 628 | 103 | if constexpr (delimiter_const) { | 629 | 103 | delimiter_ref_for_search = delimiter_column.get_data_at(0); | 630 | 103 | search.set_pattern(&delimiter_ref_for_search); | 631 | 103 | } | 632 | | | 633 | 1.08k | for (size_t i = 0; i < size; i++) { | 634 | 985 | const StringRef str_ref = | 635 | 985 | src_column_string.get_data_at(index_check_const<src_const>(i)); | 636 | 985 | const StringRef delimiter_ref = | 637 | 985 | delimiter_column.get_data_at(index_check_const<delimiter_const>(i)); | 638 | | | 639 | 985 | if (str_ref.size == 0) { | 640 | 135 | dest_offsets.push_back(dest_pos); | 641 | 135 | continue; | 642 | 135 | } | 643 | 850 | if (delimiter_ref.size == 0) { | 644 | 8 | split_empty_delimiter(str_ref, column_string_chars, column_string_offsets, | 645 | 8 | string_pos, dest_pos); | 646 | 842 | } else { | 647 | | if constexpr (!delimiter_const) { | 648 | | search.set_pattern(&delimiter_ref); | 649 | | } | 650 | 51.5k | for (size_t str_pos = 0; str_pos <= str_ref.size;) { | 651 | 50.7k | const size_t str_offset = str_pos; | 652 | 50.7k | const size_t old_size = column_string_chars.size(); | 653 | | // search first match delimter_ref index from src string among str_offset to end | 654 
| 50.7k | const char* result_start = | 655 | 50.7k | search.search(str_ref.data + str_offset, str_ref.size - str_offset); | 656 | | // compute split part size | 657 | 50.7k | const size_t split_part_size = result_start - str_ref.data - str_offset; | 658 | | // save dist string split part | 659 | 50.7k | if (split_part_size > 0) { | 660 | 50.1k | const size_t new_size = old_size + split_part_size; | 661 | 50.1k | column_string_chars.resize(new_size); | 662 | 50.1k | memcpy_small_allow_read_write_overflow15( | 663 | 50.1k | column_string_chars.data() + old_size, str_ref.data + str_offset, | 664 | 50.1k | split_part_size); | 665 | | // add dist string offset | 666 | 50.1k | string_pos += split_part_size; | 667 | 50.1k | } | 668 | 50.7k | column_string_offsets.push_back(string_pos); | 669 | | // array offset + 1 | 670 | 50.7k | dest_pos++; | 671 | | // add src string str_pos to next search start | 672 | 50.7k | str_pos += split_part_size + delimiter_ref.size; | 673 | 50.7k | } | 674 | 842 | } | 675 | 850 | dest_offsets.push_back(dest_pos); | 676 | 850 | } | 677 | 103 | } |
_ZNK5doris21FunctionSplitByString8_executeILb1ELb0EEEvRKNS_9ColumnStrIjEES5_RNS_7IColumnERNS_8PODArrayImLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm Line | Count | Source | 616 | 8 | size_t size) const { | 617 | 8 | auto& dest_column_string = assert_cast<ColumnString&>(dest_nested_column); | 618 | 8 | ColumnString::Chars& column_string_chars = dest_column_string.get_chars(); | 619 | 8 | ColumnString::Offsets& column_string_offsets = dest_column_string.get_offsets(); | 620 | 8 | column_string_chars.reserve(0); | 621 | | | 622 | 8 | ColumnArray::Offset64 string_pos = 0; | 623 | 8 | ColumnArray::Offset64 dest_pos = 0; | 624 | | | 625 | 8 | StringSearch search; | 626 | 8 | StringRef delimiter_ref_for_search; | 627 | | | 628 | | if constexpr (delimiter_const) { | 629 | | delimiter_ref_for_search = delimiter_column.get_data_at(0); | 630 | | search.set_pattern(&delimiter_ref_for_search); | 631 | | } | 632 | | | 633 | 32 | for (size_t i = 0; i < size; i++) { | 634 | 24 | const StringRef str_ref = | 635 | 24 | src_column_string.get_data_at(index_check_const<src_const>(i)); | 636 | 24 | const StringRef delimiter_ref = | 637 | 24 | delimiter_column.get_data_at(index_check_const<delimiter_const>(i)); | 638 | | | 639 | 24 | if (str_ref.size == 0) { | 640 | 16 | dest_offsets.push_back(dest_pos); | 641 | 16 | continue; | 642 | 16 | } | 643 | 8 | if (delimiter_ref.size == 0) { | 644 | 8 | split_empty_delimiter(str_ref, column_string_chars, column_string_offsets, | 645 | 8 | string_pos, dest_pos); | 646 | 8 | } else { | 647 | 0 | if constexpr (!delimiter_const) { | 648 | 0 | search.set_pattern(&delimiter_ref); | 649 | 0 | } | 650 | 0 | for (size_t str_pos = 0; str_pos <= str_ref.size;) { | 651 | 0 | const size_t str_offset = str_pos; | 652 | 0 | const size_t old_size = column_string_chars.size(); | 653 | | // search first match delimter_ref index from src string among str_offset to end | 654 | 0 | const char* result_start = | 655 | 0 | 
search.search(str_ref.data + str_offset, str_ref.size - str_offset); | 656 | | // compute split part size | 657 | 0 | const size_t split_part_size = result_start - str_ref.data - str_offset; | 658 | | // save dist string split part | 659 | 0 | if (split_part_size > 0) { | 660 | 0 | const size_t new_size = old_size + split_part_size; | 661 | 0 | column_string_chars.resize(new_size); | 662 | 0 | memcpy_small_allow_read_write_overflow15( | 663 | 0 | column_string_chars.data() + old_size, str_ref.data + str_offset, | 664 | 0 | split_part_size); | 665 | | // add dist string offset | 666 | 0 | string_pos += split_part_size; | 667 | 0 | } | 668 | 0 | column_string_offsets.push_back(string_pos); | 669 | | // array offset + 1 | 670 | 0 | dest_pos++; | 671 | | // add src string str_pos to next search start | 672 | 0 | str_pos += split_part_size + delimiter_ref.size; | 673 | 0 | } | 674 | 0 | } | 675 | 8 | dest_offsets.push_back(dest_pos); | 676 | 8 | } | 677 | 8 | } |
Unexecuted instantiation: _ZNK5doris21FunctionSplitByString8_executeILb1ELb1EEEvRKNS_9ColumnStrIjEES5_RNS_7IColumnERNS_8PODArrayImLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm |
678 | | |
679 | | void split_empty_delimiter(const StringRef& str_ref, ColumnString::Chars& column_string_chars, |
680 | | ColumnString::Offsets& column_string_offsets, |
681 | | ColumnArray::Offset64& string_pos, |
682 | 27 | ColumnArray::Offset64& dest_pos) const { |
683 | 27 | const size_t old_size = column_string_chars.size(); |
684 | 27 | const size_t new_size = old_size + str_ref.size; |
685 | 27 | column_string_chars.resize(new_size); |
686 | 27 | memcpy(column_string_chars.data() + old_size, str_ref.data, str_ref.size); |
687 | 27 | if (simd::VStringFunctions::is_ascii(str_ref)) { |
688 | 24 | const auto size = str_ref.size; |
689 | | |
690 | 24 | const auto nested_old_size = column_string_offsets.size(); |
691 | 24 | const auto nested_new_size = nested_old_size + size; |
692 | 24 | column_string_offsets.resize(nested_new_size); |
693 | 24 | std::iota(column_string_offsets.data() + nested_old_size, |
694 | 24 | column_string_offsets.data() + nested_new_size, string_pos + 1); |
695 | | |
696 | 24 | string_pos += size; |
697 | 24 | dest_pos += size; |
698 | | // The above code is equivalent to the code in the following comment. |
699 | | // for (size_t i = 0; i < str_ref.size; i++) { |
700 | | // string_pos++; |
701 | | // column_string_offsets.push_back(string_pos); |
702 | | // (*dest_nested_null_map).push_back(false); |
703 | | // dest_pos++; |
704 | | // } |
705 | 24 | } else { |
706 | 22 | for (size_t i = 0, utf8_char_len = 0; i < str_ref.size; i += utf8_char_len) { |
707 | 19 | utf8_char_len = UTF8_BYTE_LENGTH[(unsigned char)str_ref.data[i]]; |
708 | | |
709 | 19 | string_pos += utf8_char_len; |
710 | 19 | column_string_offsets.push_back(string_pos); |
711 | 19 | dest_pos++; |
712 | 19 | } |
713 | 3 | } |
714 | 27 | } |
715 | | }; |
716 | | |
// Selects the arity implemented by a FunctionCountSubString instantiation.
enum class FunctionCountSubStringType {
    TWO_ARGUMENTS,  // two-argument overload
    THREE_ARGUMENTS // three-argument overload
};
718 | | |
719 | | template <FunctionCountSubStringType type> |
720 | | class FunctionCountSubString : public IFunction { |
721 | | public: |
722 | | static constexpr auto name = "count_substrings"; |
723 | | static constexpr auto arg_count = (type == FunctionCountSubStringType::TWO_ARGUMENTS) ? 2 : 3; |
724 | | |
725 | 283 | static FunctionPtr create() { return std::make_shared<FunctionCountSubString>(); }_ZN5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE6createEv Line | Count | Source | 725 | 77 | static FunctionPtr create() { return std::make_shared<FunctionCountSubString>(); } |
_ZN5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE6createEv Line | Count | Source | 725 | 206 | static FunctionPtr create() { return std::make_shared<FunctionCountSubString>(); } |
|
726 | | using NullMapType = PaddedPODArray<UInt8>; |
727 | | |
728 | 0 | String get_name() const override { return name; }Unexecuted instantiation: _ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE8get_nameB5cxx11Ev Unexecuted instantiation: _ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE8get_nameB5cxx11Ev |
729 | | |
730 | 0 | size_t get_number_of_arguments() const override { return arg_count; }Unexecuted instantiation: _ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE23get_number_of_argumentsEv Unexecuted instantiation: _ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE23get_number_of_argumentsEv |
731 | | |
732 | 265 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
733 | 265 | return std::make_shared<DataTypeInt32>(); |
734 | 265 | } _ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS7_EE Line | Count | Source | 732 | 68 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { | 733 | 68 | return std::make_shared<DataTypeInt32>(); | 734 | 68 | } |
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS7_EE Line | Count | Source | 732 | 197 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { | 733 | 197 | return std::make_shared<DataTypeInt32>(); | 734 | 197 | } |
|
735 | | |
736 | 16 | DataTypes get_variadic_argument_types_impl() const override { |
737 | 16 | if constexpr (type == FunctionCountSubStringType::TWO_ARGUMENTS) { |
738 | 8 | return {std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>()}; |
739 | 8 | } else { |
740 | 8 | return {std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>(), |
741 | 8 | std::make_shared<DataTypeInt32>()}; |
742 | 8 | } |
743 | 16 | } _ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE32get_variadic_argument_types_implEv Line | Count | Source | 736 | 8 | DataTypes get_variadic_argument_types_impl() const override { | 737 | 8 | if constexpr (type == FunctionCountSubStringType::TWO_ARGUMENTS) { | 738 | 8 | return {std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>()}; | 739 | | } else { | 740 | | return {std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>(), | 741 | | std::make_shared<DataTypeInt32>()}; | 742 | | } | 743 | 8 | } |
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE32get_variadic_argument_types_implEv Line | Count | Source | 736 | 8 | DataTypes get_variadic_argument_types_impl() const override { | 737 | | if constexpr (type == FunctionCountSubStringType::TWO_ARGUMENTS) { | 738 | | return {std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>()}; | 739 | 8 | } else { | 740 | 8 | return {std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>(), | 741 | 8 | std::make_shared<DataTypeInt32>()}; | 742 | 8 | } | 743 | 8 | } |
|
744 | | |
745 | 267 | bool is_variadic() const override { return true; }_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE11is_variadicEv Line | Count | Source | 745 | 69 | bool is_variadic() const override { return true; } |
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE11is_variadicEv Line | Count | Source | 745 | 198 | bool is_variadic() const override { return true; } |
|
746 | | |
747 | | Status execute_impl(FunctionContext* /*context*/, Block& block, const ColumnNumbers& arguments, |
748 | 234 | uint32_t result, size_t input_rows_count) const override { |
749 | 234 | DCHECK(arg_count); |
750 | 234 | bool col_const[arg_count]; |
751 | 234 | ColumnPtr argument_columns[arg_count]; |
752 | 878 | for (int i = 0; i < arg_count; ++i) { |
753 | 644 | std::tie(argument_columns[i], col_const[i]) = |
754 | 644 | unpack_if_const(block.get_by_position(arguments[i]).column); |
755 | 644 | } |
756 | | |
757 | 234 | auto dest_column_ptr = ColumnInt32::create(input_rows_count); |
758 | 234 | auto& dest_column_data = dest_column_ptr->get_data(); |
759 | | |
760 | 234 | if constexpr (type == FunctionCountSubStringType::TWO_ARGUMENTS) { |
761 | 58 | const auto& src_column_string = assert_cast<const ColumnString&>(*argument_columns[0]); |
762 | 58 | const auto& pattern_column = assert_cast<const ColumnString&>(*argument_columns[1]); |
763 | 58 | std::visit( |
764 | 58 | [&](auto str_const, auto pattern_const) { |
765 | 58 | _execute<str_const, pattern_const>(src_column_string, pattern_column, |
766 | 58 | dest_column_data, input_rows_count); |
767 | 58 | }, _ZZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_E_clISt17integral_constantIbLb0EESH_EEDaSC_SD_ Line | Count | Source | 764 | 32 | [&](auto str_const, auto pattern_const) { | 765 | 32 | _execute<str_const, pattern_const>(src_column_string, pattern_column, | 766 | 32 | dest_column_data, input_rows_count); | 767 | 32 | }, |
_ZZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_E_clISt17integral_constantIbLb0EESG_IbLb1EEEEDaSC_SD_ Line | Count | Source | 764 | 13 | [&](auto str_const, auto pattern_const) { | 765 | 13 | _execute<str_const, pattern_const>(src_column_string, pattern_column, | 766 | 13 | dest_column_data, input_rows_count); | 767 | 13 | }, |
_ZZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_E_clISt17integral_constantIbLb1EESG_IbLb0EEEEDaSC_SD_ Line | Count | Source | 764 | 13 | [&](auto str_const, auto pattern_const) { | 765 | 13 | _execute<str_const, pattern_const>(src_column_string, pattern_column, | 766 | 13 | dest_column_data, input_rows_count); | 767 | 13 | }, |
Unexecuted instantiation: _ZZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_E_clISt17integral_constantIbLb1EESH_EEDaSC_SD_ |
768 | 58 | make_bool_variant(col_const[0]), make_bool_variant(col_const[1])); |
769 | 176 | } else { |
770 | 176 | const auto& src_column_string = assert_cast<const ColumnString&>(*argument_columns[0]); |
771 | 176 | const auto& pattern_column = assert_cast<const ColumnString&>(*argument_columns[1]); |
772 | 176 | const auto& start_pos_column = assert_cast<const ColumnInt32&>(*argument_columns[2]); |
773 | 176 | std::visit( |
774 | 176 | [&](auto str_const, auto pattern_const, auto start_pos_const) { |
775 | 176 | _execute<str_const, pattern_const, start_pos_const>( |
776 | 176 | src_column_string, pattern_column, start_pos_column, |
777 | 176 | dest_column_data, input_rows_count); |
778 | 176 | }, _ZZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb0EESI_SI_EEDaSC_SD_SE_ Line | Count | Source | 774 | 36 | [&](auto str_const, auto pattern_const, auto start_pos_const) { | 775 | 36 | _execute<str_const, pattern_const, start_pos_const>( | 776 | 36 | src_column_string, pattern_column, start_pos_column, | 777 | 36 | dest_column_data, input_rows_count); | 778 | 36 | }, |
_ZZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb0EESI_SH_IbLb1EEEEDaSC_SD_SE_ Line | Count | Source | 774 | 29 | [&](auto str_const, auto pattern_const, auto start_pos_const) { | 775 | 29 | _execute<str_const, pattern_const, start_pos_const>( | 776 | 29 | src_column_string, pattern_column, start_pos_column, | 777 | 29 | dest_column_data, input_rows_count); | 778 | 29 | }, |
_ZZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb0EESH_IbLb1EESI_EEDaSC_SD_SE_ Line | Count | Source | 774 | 22 | [&](auto str_const, auto pattern_const, auto start_pos_const) { | 775 | 22 | _execute<str_const, pattern_const, start_pos_const>( | 776 | 22 | src_column_string, pattern_column, start_pos_column, | 777 | 22 | dest_column_data, input_rows_count); | 778 | 22 | }, |
_ZZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb0EESH_IbLb1EESJ_EEDaSC_SD_SE_ Line | Count | Source | 774 | 23 | [&](auto str_const, auto pattern_const, auto start_pos_const) { | 775 | 23 | _execute<str_const, pattern_const, start_pos_const>( | 776 | 23 | src_column_string, pattern_column, start_pos_column, | 777 | 23 | dest_column_data, input_rows_count); | 778 | 23 | }, |
_ZZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb1EESH_IbLb0EESJ_EEDaSC_SD_SE_ Line | Count | Source | 774 | 22 | [&](auto str_const, auto pattern_const, auto start_pos_const) { | 775 | 22 | _execute<str_const, pattern_const, start_pos_const>( | 776 | 22 | src_column_string, pattern_column, start_pos_column, | 777 | 22 | dest_column_data, input_rows_count); | 778 | 22 | }, |
_ZZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb1EESH_IbLb0EESI_EEDaSC_SD_SE_ Line | Count | Source | 774 | 22 | [&](auto str_const, auto pattern_const, auto start_pos_const) { | 775 | 22 | _execute<str_const, pattern_const, start_pos_const>( | 776 | 22 | src_column_string, pattern_column, start_pos_column, | 777 | 22 | dest_column_data, input_rows_count); | 778 | 22 | }, |
_ZZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb1EESI_SH_IbLb0EEEEDaSC_SD_SE_ Line | Count | Source | 774 | 22 | [&](auto str_const, auto pattern_const, auto start_pos_const) { | 775 | 22 | _execute<str_const, pattern_const, start_pos_const>( | 776 | 22 | src_column_string, pattern_column, start_pos_column, | 777 | 22 | dest_column_data, input_rows_count); | 778 | 22 | }, |
Unexecuted instantiation: _ZZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb1EESI_SI_EEDaSC_SD_SE_ |
779 | 176 | make_bool_variant(col_const[0]), make_bool_variant(col_const[1]), |
780 | 176 | make_bool_variant(col_const[2])); |
781 | 176 | } |
782 | | |
783 | 234 | block.replace_by_position(result, std::move(dest_column_ptr)); |
784 | 234 | return Status::OK(); |
785 | 234 | } _ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm Line | Count | Source | 748 | 58 | uint32_t result, size_t input_rows_count) const override { | 749 | 58 | DCHECK(arg_count); | 750 | 58 | bool col_const[arg_count]; | 751 | 58 | ColumnPtr argument_columns[arg_count]; | 752 | 174 | for (int i = 0; i < arg_count; ++i) { | 753 | 116 | std::tie(argument_columns[i], col_const[i]) = | 754 | 116 | unpack_if_const(block.get_by_position(arguments[i]).column); | 755 | 116 | } | 756 | | | 757 | 58 | auto dest_column_ptr = ColumnInt32::create(input_rows_count); | 758 | 58 | auto& dest_column_data = dest_column_ptr->get_data(); | 759 | | | 760 | 58 | if constexpr (type == FunctionCountSubStringType::TWO_ARGUMENTS) { | 761 | 58 | const auto& src_column_string = assert_cast<const ColumnString&>(*argument_columns[0]); | 762 | 58 | const auto& pattern_column = assert_cast<const ColumnString&>(*argument_columns[1]); | 763 | 58 | std::visit( | 764 | 58 | [&](auto str_const, auto pattern_const) { | 765 | 58 | _execute<str_const, pattern_const>(src_column_string, pattern_column, | 766 | 58 | dest_column_data, input_rows_count); | 767 | 58 | }, | 768 | 58 | make_bool_variant(col_const[0]), make_bool_variant(col_const[1])); | 769 | | } else { | 770 | | const auto& src_column_string = assert_cast<const ColumnString&>(*argument_columns[0]); | 771 | | const auto& pattern_column = assert_cast<const ColumnString&>(*argument_columns[1]); | 772 | | const auto& start_pos_column = assert_cast<const ColumnInt32&>(*argument_columns[2]); | 773 | | std::visit( | 774 | | [&](auto str_const, auto pattern_const, auto start_pos_const) { | 775 | | _execute<str_const, pattern_const, start_pos_const>( | 776 | | src_column_string, pattern_column, start_pos_column, | 777 | | dest_column_data, input_rows_count); | 778 | | }, | 779 | | make_bool_variant(col_const[0]), make_bool_variant(col_const[1]), | 780 | 
| make_bool_variant(col_const[2])); | 781 | | } | 782 | | | 783 | 58 | block.replace_by_position(result, std::move(dest_column_ptr)); | 784 | 58 | return Status::OK(); | 785 | 58 | } |
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm Line | Count | Source | 748 | 176 | uint32_t result, size_t input_rows_count) const override { | 749 | 176 | DCHECK(arg_count); | 750 | 176 | bool col_const[arg_count]; | 751 | 176 | ColumnPtr argument_columns[arg_count]; | 752 | 704 | for (int i = 0; i < arg_count; ++i) { | 753 | 528 | std::tie(argument_columns[i], col_const[i]) = | 754 | 528 | unpack_if_const(block.get_by_position(arguments[i]).column); | 755 | 528 | } | 756 | | | 757 | 176 | auto dest_column_ptr = ColumnInt32::create(input_rows_count); | 758 | 176 | auto& dest_column_data = dest_column_ptr->get_data(); | 759 | | | 760 | | if constexpr (type == FunctionCountSubStringType::TWO_ARGUMENTS) { | 761 | | const auto& src_column_string = assert_cast<const ColumnString&>(*argument_columns[0]); | 762 | | const auto& pattern_column = assert_cast<const ColumnString&>(*argument_columns[1]); | 763 | | std::visit( | 764 | | [&](auto str_const, auto pattern_const) { | 765 | | _execute<str_const, pattern_const>(src_column_string, pattern_column, | 766 | | dest_column_data, input_rows_count); | 767 | | }, | 768 | | make_bool_variant(col_const[0]), make_bool_variant(col_const[1])); | 769 | 176 | } else { | 770 | 176 | const auto& src_column_string = assert_cast<const ColumnString&>(*argument_columns[0]); | 771 | 176 | const auto& pattern_column = assert_cast<const ColumnString&>(*argument_columns[1]); | 772 | 176 | const auto& start_pos_column = assert_cast<const ColumnInt32&>(*argument_columns[2]); | 773 | 176 | std::visit( | 774 | 176 | [&](auto str_const, auto pattern_const, auto start_pos_const) { | 775 | 176 | _execute<str_const, pattern_const, start_pos_const>( | 776 | 176 | src_column_string, pattern_column, start_pos_column, | 777 | 176 | dest_column_data, input_rows_count); | 778 | 176 | }, | 779 | 176 | make_bool_variant(col_const[0]), 
make_bool_variant(col_const[1]), | 780 | 176 | make_bool_variant(col_const[2])); | 781 | 176 | } | 782 | | | 783 | 176 | block.replace_by_position(result, std::move(dest_column_ptr)); | 784 | 176 | return Status::OK(); | 785 | 176 | } |
|
786 | | |
787 | | private: |
788 | | template <bool src_const, bool pattern_const> |
789 | | void _execute(const ColumnString& src_column_string, const ColumnString& pattern_column, |
790 | 58 | ColumnInt32::Container& dest_column_data, size_t size) const { |
791 | 241 | for (size_t i = 0; i < size; i++) { |
792 | 183 | const StringRef str_ref = |
793 | 183 | src_column_string.get_data_at(index_check_const<src_const>(i)); |
794 | | |
795 | 183 | const StringRef pattern_ref = |
796 | 183 | pattern_column.get_data_at(index_check_const<pattern_const>(i)); |
797 | 183 | dest_column_data[i] = find_str_count(str_ref, pattern_ref); |
798 | 183 | } |
799 | 58 | } _ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE8_executeILb0ELb0EEEvRKNS_9ColumnStrIjEES7_RNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm Line | Count | Source | 790 | 32 | ColumnInt32::Container& dest_column_data, size_t size) const { | 791 | 133 | for (size_t i = 0; i < size; i++) { | 792 | 101 | const StringRef str_ref = | 793 | 101 | src_column_string.get_data_at(index_check_const<src_const>(i)); | 794 | | | 795 | 101 | const StringRef pattern_ref = | 796 | 101 | pattern_column.get_data_at(index_check_const<pattern_const>(i)); | 797 | 101 | dest_column_data[i] = find_str_count(str_ref, pattern_ref); | 798 | 101 | } | 799 | 32 | } |
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE8_executeILb0ELb1EEEvRKNS_9ColumnStrIjEES7_RNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm Line | Count | Source | 790 | 13 | ColumnInt32::Container& dest_column_data, size_t size) const { | 791 | 54 | for (size_t i = 0; i < size; i++) { | 792 | 41 | const StringRef str_ref = | 793 | 41 | src_column_string.get_data_at(index_check_const<src_const>(i)); | 794 | | | 795 | 41 | const StringRef pattern_ref = | 796 | 41 | pattern_column.get_data_at(index_check_const<pattern_const>(i)); | 797 | 41 | dest_column_data[i] = find_str_count(str_ref, pattern_ref); | 798 | 41 | } | 799 | 13 | } |
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE8_executeILb1ELb0EEEvRKNS_9ColumnStrIjEES7_RNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm Line | Count | Source | 790 | 13 | ColumnInt32::Container& dest_column_data, size_t size) const { | 791 | 54 | for (size_t i = 0; i < size; i++) { | 792 | 41 | const StringRef str_ref = | 793 | 41 | src_column_string.get_data_at(index_check_const<src_const>(i)); | 794 | | | 795 | 41 | const StringRef pattern_ref = | 796 | 41 | pattern_column.get_data_at(index_check_const<pattern_const>(i)); | 797 | 41 | dest_column_data[i] = find_str_count(str_ref, pattern_ref); | 798 | 41 | } | 799 | 13 | } |
Unexecuted instantiation: _ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE8_executeILb1ELb1EEEvRKNS_9ColumnStrIjEES7_RNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm |
800 | | |
801 | | template <bool src_const, bool pattern_const, bool start_pos_const> |
802 | | void _execute(const ColumnString& src_column_string, const ColumnString& pattern_column, |
803 | | const ColumnInt32& start_pos_column, ColumnInt32::Container& dest_column_data, |
804 | 176 | size_t size) const { |
805 | 411 | for (size_t i = 0; i < size; i++) { |
806 | 235 | const StringRef str_ref = |
807 | 235 | src_column_string.get_data_at(index_check_const<src_const>(i)); |
808 | 235 | const StringRef pattern_ref = |
809 | 235 | pattern_column.get_data_at(index_check_const<pattern_const>(i)); |
810 | | // 1-based index |
811 | 235 | int32_t start_pos = |
812 | 235 | start_pos_column.get_element(index_check_const<start_pos_const>(i)) - 1; |
813 | | |
814 | 235 | const char* p = str_ref.begin(); |
815 | 235 | const char* end = str_ref.end(); |
816 | 235 | int char_size = 0; |
817 | 1.47k | for (size_t j = 0; j < start_pos && p < end; ++j, p += char_size) { |
818 | 1.24k | char_size = UTF8_BYTE_LENGTH[static_cast<uint8_t>(*p)]; |
819 | 1.24k | } |
820 | 235 | const auto start_byte_len = p - str_ref.begin(); |
821 | | |
822 | 235 | if (start_pos < 0 || start_byte_len >= str_ref.size) { |
823 | 134 | dest_column_data[i] = 0; |
824 | 134 | } else { |
825 | 101 | dest_column_data[i] = |
826 | 101 | find_str_count(str_ref.substring(start_byte_len), pattern_ref); |
827 | 101 | } |
828 | 235 | } |
829 | 176 | } _ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE8_executeILb0ELb0ELb0EEEvRKNS_9ColumnStrIjEES7_RKNS_12ColumnVectorILNS_13PrimitiveTypeE5EEERNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm Line | Count | Source | 804 | 36 | size_t size) const { | 805 | 97 | for (size_t i = 0; i < size; i++) { | 806 | 61 | const StringRef str_ref = | 807 | 61 | src_column_string.get_data_at(index_check_const<src_const>(i)); | 808 | 61 | const StringRef pattern_ref = | 809 | 61 | pattern_column.get_data_at(index_check_const<pattern_const>(i)); | 810 | | // 1-based index | 811 | 61 | int32_t start_pos = | 812 | 61 | start_pos_column.get_element(index_check_const<start_pos_const>(i)) - 1; | 813 | | | 814 | 61 | const char* p = str_ref.begin(); | 815 | 61 | const char* end = str_ref.end(); | 816 | 61 | int char_size = 0; | 817 | 456 | for (size_t j = 0; j < start_pos && p < end; ++j, p += char_size) { | 818 | 395 | char_size = UTF8_BYTE_LENGTH[static_cast<uint8_t>(*p)]; | 819 | 395 | } | 820 | 61 | const auto start_byte_len = p - str_ref.begin(); | 821 | | | 822 | 61 | if (start_pos < 0 || start_byte_len >= str_ref.size) { | 823 | 38 | dest_column_data[i] = 0; | 824 | 38 | } else { | 825 | 23 | dest_column_data[i] = | 826 | 23 | find_str_count(str_ref.substring(start_byte_len), pattern_ref); | 827 | 23 | } | 828 | 61 | } | 829 | 36 | } |
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE8_executeILb0ELb0ELb1EEEvRKNS_9ColumnStrIjEES7_RKNS_12ColumnVectorILNS_13PrimitiveTypeE5EEERNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm Line | Count | Source | 804 | 29 | size_t size) const { | 805 | 78 | for (size_t i = 0; i < size; i++) { | 806 | 49 | const StringRef str_ref = | 807 | 49 | src_column_string.get_data_at(index_check_const<src_const>(i)); | 808 | 49 | const StringRef pattern_ref = | 809 | 49 | pattern_column.get_data_at(index_check_const<pattern_const>(i)); | 810 | | // 1-based index | 811 | 49 | int32_t start_pos = | 812 | 49 | start_pos_column.get_element(index_check_const<start_pos_const>(i)) - 1; | 813 | | | 814 | 49 | const char* p = str_ref.begin(); | 815 | 49 | const char* end = str_ref.end(); | 816 | 49 | int char_size = 0; | 817 | 242 | for (size_t j = 0; j < start_pos && p < end; ++j, p += char_size) { | 818 | 193 | char_size = UTF8_BYTE_LENGTH[static_cast<uint8_t>(*p)]; | 819 | 193 | } | 820 | 49 | const auto start_byte_len = p - str_ref.begin(); | 821 | | | 822 | 49 | if (start_pos < 0 || start_byte_len >= str_ref.size) { | 823 | 22 | dest_column_data[i] = 0; | 824 | 27 | } else { | 825 | 27 | dest_column_data[i] = | 826 | 27 | find_str_count(str_ref.substring(start_byte_len), pattern_ref); | 827 | 27 | } | 828 | 49 | } | 829 | 29 | } |
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE8_executeILb0ELb1ELb0EEEvRKNS_9ColumnStrIjEES7_RKNS_12ColumnVectorILNS_13PrimitiveTypeE5EEERNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm Line | Count | Source | 804 | 22 | size_t size) const { | 805 | 44 | for (size_t i = 0; i < size; i++) { | 806 | 22 | const StringRef str_ref = | 807 | 22 | src_column_string.get_data_at(index_check_const<src_const>(i)); | 808 | 22 | const StringRef pattern_ref = | 809 | 22 | pattern_column.get_data_at(index_check_const<pattern_const>(i)); | 810 | | // 1-based index | 811 | 22 | int32_t start_pos = | 812 | 22 | start_pos_column.get_element(index_check_const<start_pos_const>(i)) - 1; | 813 | | | 814 | 22 | const char* p = str_ref.begin(); | 815 | 22 | const char* end = str_ref.end(); | 816 | 22 | int char_size = 0; | 817 | 151 | for (size_t j = 0; j < start_pos && p < end; ++j, p += char_size) { | 818 | 129 | char_size = UTF8_BYTE_LENGTH[static_cast<uint8_t>(*p)]; | 819 | 129 | } | 820 | 22 | const auto start_byte_len = p - str_ref.begin(); | 821 | | | 822 | 22 | if (start_pos < 0 || start_byte_len >= str_ref.size) { | 823 | 14 | dest_column_data[i] = 0; | 824 | 14 | } else { | 825 | 8 | dest_column_data[i] = | 826 | 8 | find_str_count(str_ref.substring(start_byte_len), pattern_ref); | 827 | 8 | } | 828 | 22 | } | 829 | 22 | } |
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE8_executeILb0ELb1ELb1EEEvRKNS_9ColumnStrIjEES7_RKNS_12ColumnVectorILNS_13PrimitiveTypeE5EEERNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm Line | Count | Source | 804 | 23 | size_t size) const { | 805 | 60 | for (size_t i = 0; i < size; i++) { | 806 | 37 | const StringRef str_ref = | 807 | 37 | src_column_string.get_data_at(index_check_const<src_const>(i)); | 808 | 37 | const StringRef pattern_ref = | 809 | 37 | pattern_column.get_data_at(index_check_const<pattern_const>(i)); | 810 | | // 1-based index | 811 | 37 | int32_t start_pos = | 812 | 37 | start_pos_column.get_element(index_check_const<start_pos_const>(i)) - 1; | 813 | | | 814 | 37 | const char* p = str_ref.begin(); | 815 | 37 | const char* end = str_ref.end(); | 816 | 37 | int char_size = 0; | 817 | 177 | for (size_t j = 0; j < start_pos && p < end; ++j, p += char_size) { | 818 | 140 | char_size = UTF8_BYTE_LENGTH[static_cast<uint8_t>(*p)]; | 819 | 140 | } | 820 | 37 | const auto start_byte_len = p - str_ref.begin(); | 821 | | | 822 | 37 | if (start_pos < 0 || start_byte_len >= str_ref.size) { | 823 | 18 | dest_column_data[i] = 0; | 824 | 19 | } else { | 825 | 19 | dest_column_data[i] = | 826 | 19 | find_str_count(str_ref.substring(start_byte_len), pattern_ref); | 827 | 19 | } | 828 | 37 | } | 829 | 23 | } |
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE8_executeILb1ELb0ELb0EEEvRKNS_9ColumnStrIjEES7_RKNS_12ColumnVectorILNS_13PrimitiveTypeE5EEERNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm Line | Count | Source | 804 | 22 | size_t size) const { | 805 | 44 | for (size_t i = 0; i < size; i++) { | 806 | 22 | const StringRef str_ref = | 807 | 22 | src_column_string.get_data_at(index_check_const<src_const>(i)); | 808 | 22 | const StringRef pattern_ref = | 809 | 22 | pattern_column.get_data_at(index_check_const<pattern_const>(i)); | 810 | | // 1-based index | 811 | 22 | int32_t start_pos = | 812 | 22 | start_pos_column.get_element(index_check_const<start_pos_const>(i)) - 1; | 813 | | | 814 | 22 | const char* p = str_ref.begin(); | 815 | 22 | const char* end = str_ref.end(); | 816 | 22 | int char_size = 0; | 817 | 151 | for (size_t j = 0; j < start_pos && p < end; ++j, p += char_size) { | 818 | 129 | char_size = UTF8_BYTE_LENGTH[static_cast<uint8_t>(*p)]; | 819 | 129 | } | 820 | 22 | const auto start_byte_len = p - str_ref.begin(); | 821 | | | 822 | 22 | if (start_pos < 0 || start_byte_len >= str_ref.size) { | 823 | 14 | dest_column_data[i] = 0; | 824 | 14 | } else { | 825 | 8 | dest_column_data[i] = | 826 | 8 | find_str_count(str_ref.substring(start_byte_len), pattern_ref); | 827 | 8 | } | 828 | 22 | } | 829 | 22 | } |
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE8_executeILb1ELb0ELb1EEEvRKNS_9ColumnStrIjEES7_RKNS_12ColumnVectorILNS_13PrimitiveTypeE5EEERNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm Line | Count | Source | 804 | 22 | size_t size) const { | 805 | 44 | for (size_t i = 0; i < size; i++) { | 806 | 22 | const StringRef str_ref = | 807 | 22 | src_column_string.get_data_at(index_check_const<src_const>(i)); | 808 | 22 | const StringRef pattern_ref = | 809 | 22 | pattern_column.get_data_at(index_check_const<pattern_const>(i)); | 810 | | // 1-based index | 811 | 22 | int32_t start_pos = | 812 | 22 | start_pos_column.get_element(index_check_const<start_pos_const>(i)) - 1; | 813 | | | 814 | 22 | const char* p = str_ref.begin(); | 815 | 22 | const char* end = str_ref.end(); | 816 | 22 | int char_size = 0; | 817 | 151 | for (size_t j = 0; j < start_pos && p < end; ++j, p += char_size) { | 818 | 129 | char_size = UTF8_BYTE_LENGTH[static_cast<uint8_t>(*p)]; | 819 | 129 | } | 820 | 22 | const auto start_byte_len = p - str_ref.begin(); | 821 | | | 822 | 22 | if (start_pos < 0 || start_byte_len >= str_ref.size) { | 823 | 14 | dest_column_data[i] = 0; | 824 | 14 | } else { | 825 | 8 | dest_column_data[i] = | 826 | 8 | find_str_count(str_ref.substring(start_byte_len), pattern_ref); | 827 | 8 | } | 828 | 22 | } | 829 | 22 | } |
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE8_executeILb1ELb1ELb0EEEvRKNS_9ColumnStrIjEES7_RKNS_12ColumnVectorILNS_13PrimitiveTypeE5EEERNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm Line | Count | Source | 804 | 22 | size_t size) const { | 805 | 44 | for (size_t i = 0; i < size; i++) { | 806 | 22 | const StringRef str_ref = | 807 | 22 | src_column_string.get_data_at(index_check_const<src_const>(i)); | 808 | 22 | const StringRef pattern_ref = | 809 | 22 | pattern_column.get_data_at(index_check_const<pattern_const>(i)); | 810 | | // 1-based index | 811 | 22 | int32_t start_pos = | 812 | 22 | start_pos_column.get_element(index_check_const<start_pos_const>(i)) - 1; | 813 | | | 814 | 22 | const char* p = str_ref.begin(); | 815 | 22 | const char* end = str_ref.end(); | 816 | 22 | int char_size = 0; | 817 | 151 | for (size_t j = 0; j < start_pos && p < end; ++j, p += char_size) { | 818 | 129 | char_size = UTF8_BYTE_LENGTH[static_cast<uint8_t>(*p)]; | 819 | 129 | } | 820 | 22 | const auto start_byte_len = p - str_ref.begin(); | 821 | | | 822 | 22 | if (start_pos < 0 || start_byte_len >= str_ref.size) { | 823 | 14 | dest_column_data[i] = 0; | 824 | 14 | } else { | 825 | 8 | dest_column_data[i] = | 826 | 8 | find_str_count(str_ref.substring(start_byte_len), pattern_ref); | 827 | 8 | } | 828 | 22 | } | 829 | 22 | } |
Unexecuted instantiation: _ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE8_executeILb1ELb1ELb1EEEvRKNS_9ColumnStrIjEES7_RKNS_12ColumnVectorILNS_13PrimitiveTypeE5EEERNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm |
830 | | |
831 | 529 | size_t find_pos(size_t pos, const StringRef str_ref, const StringRef pattern_ref) const { |
832 | 529 | size_t old_size = pos; |
833 | 529 | size_t str_size = str_ref.size; |
834 | 2.20k | while (pos < str_size && |
835 | 2.20k | memcmp_small_allow_overflow15((const uint8_t*)str_ref.data + pos, |
836 | 1.98k | (const uint8_t*)pattern_ref.data, pattern_ref.size)) { |
837 | 1.67k | pos++; |
838 | 1.67k | } |
839 | 529 | return pos - old_size; |
840 | 529 | } _ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE8find_posEmNS_9StringRefES3_ Line | Count | Source | 831 | 291 | size_t find_pos(size_t pos, const StringRef str_ref, const StringRef pattern_ref) const { | 832 | 291 | size_t old_size = pos; | 833 | 291 | size_t str_size = str_ref.size; | 834 | 1.05k | while (pos < str_size && | 835 | 1.05k | memcmp_small_allow_overflow15((const uint8_t*)str_ref.data + pos, | 836 | 933 | (const uint8_t*)pattern_ref.data, pattern_ref.size)) { | 837 | 763 | pos++; | 838 | 763 | } | 839 | 291 | return pos - old_size; | 840 | 291 | } |
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE8find_posEmNS_9StringRefES3_ Line | Count | Source | 831 | 238 | size_t find_pos(size_t pos, const StringRef str_ref, const StringRef pattern_ref) const { | 832 | 238 | size_t old_size = pos; | 833 | 238 | size_t str_size = str_ref.size; | 834 | 1.15k | while (pos < str_size && | 835 | 1.15k | memcmp_small_allow_overflow15((const uint8_t*)str_ref.data + pos, | 836 | 1.05k | (const uint8_t*)pattern_ref.data, pattern_ref.size)) { | 837 | 914 | pos++; | 838 | 914 | } | 839 | 238 | return pos - old_size; | 840 | 238 | } |
|
841 | | |
842 | 284 | int find_str_count(const StringRef str_ref, StringRef pattern_ref) const { |
843 | 284 | int count = 0; |
844 | 284 | if (str_ref.size == 0 || pattern_ref.size == 0) { |
845 | 64 | return 0; |
846 | 220 | } else { |
847 | 529 | for (size_t str_pos = 0; str_pos <= str_ref.size;) { |
848 | 529 | const size_t res_pos = find_pos(str_pos, str_ref, pattern_ref); |
849 | 529 | if (res_pos == (str_ref.size - str_pos)) { |
850 | 220 | break; // not find |
851 | 220 | } |
852 | 309 | count++; |
853 | 309 | str_pos = str_pos + res_pos + pattern_ref.size; |
854 | 309 | } |
855 | 220 | } |
856 | 220 | return count; |
857 | 284 | } _ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE14find_str_countENS_9StringRefES3_ Line | Count | Source | 842 | 183 | int find_str_count(const StringRef str_ref, StringRef pattern_ref) const { | 843 | 183 | int count = 0; | 844 | 183 | if (str_ref.size == 0 || pattern_ref.size == 0) { | 845 | 62 | return 0; | 846 | 121 | } else { | 847 | 291 | for (size_t str_pos = 0; str_pos <= str_ref.size;) { | 848 | 291 | const size_t res_pos = find_pos(str_pos, str_ref, pattern_ref); | 849 | 291 | if (res_pos == (str_ref.size - str_pos)) { | 850 | 121 | break; // not find | 851 | 121 | } | 852 | 170 | count++; | 853 | 170 | str_pos = str_pos + res_pos + pattern_ref.size; | 854 | 170 | } | 855 | 121 | } | 856 | 121 | return count; | 857 | 183 | } |
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE14find_str_countENS_9StringRefES3_ Line | Count | Source | 842 | 101 | int find_str_count(const StringRef str_ref, StringRef pattern_ref) const { | 843 | 101 | int count = 0; | 844 | 101 | if (str_ref.size == 0 || pattern_ref.size == 0) { | 845 | 2 | return 0; | 846 | 99 | } else { | 847 | 238 | for (size_t str_pos = 0; str_pos <= str_ref.size;) { | 848 | 238 | const size_t res_pos = find_pos(str_pos, str_ref, pattern_ref); | 849 | 238 | if (res_pos == (str_ref.size - str_pos)) { | 850 | 99 | break; // not find | 851 | 99 | } | 852 | 139 | count++; | 853 | 139 | str_pos = str_pos + res_pos + pattern_ref.size; | 854 | 139 | } | 855 | 99 | } | 856 | 99 | return count; | 857 | 101 | } |
|
858 | | }; |
859 | | |
860 | 8 | void register_function_string_search(SimpleFunctionFactory& factory) { |
861 | 8 | factory.register_function<FunctionStringLocatePos>(); |
862 | 8 | factory.register_function<FunctionSplitPart>(); |
863 | 8 | factory.register_function<FunctionSplitByString>(); |
864 | 8 | factory.register_function<FunctionCountSubString<FunctionCountSubStringType::TWO_ARGUMENTS>>(); |
865 | 8 | factory.register_function< |
866 | 8 | FunctionCountSubString<FunctionCountSubStringType::THREE_ARGUMENTS>>(); |
867 | 8 | factory.register_function<FunctionSubstringIndex>(); |
868 | | |
869 | 8 | factory.register_alias(FunctionStringLocatePos::name, "position"); |
870 | 8 | } |
871 | | |
872 | | #include "common/compile_check_avoid_end.h" |
873 | | } // namespace doris |