be/src/exprs/function/function_string_search.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include <cstddef> |
19 | | #include <cstring> |
20 | | #include <numeric> |
21 | | #include <string> |
22 | | #include <string_view> |
23 | | #include <vector> |
24 | | |
25 | | #include "common/status.h" |
26 | | #include "core/assert_cast.h" |
27 | | #include "core/block/block.h" |
28 | | #include "core/block/column_numbers.h" |
29 | | #include "core/column/column_array.h" |
30 | | #include "core/column/column_const.h" |
31 | | #include "core/column/column_nullable.h" |
32 | | #include "core/column/column_string.h" |
33 | | #include "core/column/column_vector.h" |
34 | | #include "core/data_type/data_type_array.h" |
35 | | #include "core/data_type/data_type_nullable.h" |
36 | | #include "core/data_type/data_type_number.h" |
37 | | #include "core/data_type/data_type_string.h" |
38 | | #include "core/data_type/define_primitive_type.h" |
39 | | #include "core/memcmp_small.h" |
40 | | #include "core/memcpy_small.h" |
41 | | #include "core/pod_array_fwd.h" |
42 | | #include "core/string_ref.h" |
43 | | #include "exec/common/stringop_substring.h" |
44 | | #include "exec/common/template_helpers.hpp" |
45 | | #include "exec/common/util.hpp" |
46 | | #include "exprs/function/function.h" |
47 | | #include "exprs/function/function_helpers.h" |
48 | | #include "exprs/function/simple_function_factory.h" |
49 | | #include "exprs/function_context.h" |
50 | | #include "util/simd/vstring_function.h" |
51 | | #include "util/string_search.hpp" |
52 | | |
53 | | namespace doris { |
54 | | #include "common/compile_check_avoid_begin.h" |
55 | | |
56 | | class FunctionStringLocatePos : public IFunction { |
57 | | public: |
58 | | static constexpr auto name = "locate"; |
59 | 873 | static FunctionPtr create() { return std::make_shared<FunctionStringLocatePos>(); } |
60 | 0 | String get_name() const override { return name; } |
61 | 0 | size_t get_number_of_arguments() const override { return 3; } |
62 | | |
63 | 864 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
64 | 864 | return std::make_shared<DataTypeInt32>(); |
65 | 864 | } |
66 | | |
67 | 8 | DataTypes get_variadic_argument_types_impl() const override { |
68 | 8 | return {std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>(), |
69 | 8 | std::make_shared<DataTypeInt32>()}; |
70 | 8 | } |
71 | | |
72 | 865 | bool is_variadic() const override { return true; } |
73 | | |
74 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
75 | 651 | uint32_t result, size_t input_rows_count) const override { |
76 | 651 | if (arguments.size() != 3) { |
77 | 0 | return Status::InvalidArgument("Function {} requires 3 arguments, but got {}", |
78 | 0 | get_name(), arguments.size()); |
79 | 0 | } |
80 | 651 | bool col_const[3]; |
81 | 651 | ColumnPtr argument_columns[3]; |
82 | 2.60k | for (int i = 0; i < 3; ++i) { |
83 | 1.95k | std::tie(argument_columns[i], col_const[i]) = |
84 | 1.95k | unpack_if_const(block.get_by_position(arguments[i]).column); |
85 | 1.95k | } |
86 | | |
87 | 651 | const auto* col_left = assert_cast<const ColumnString*>(argument_columns[0].get()); |
88 | 651 | const auto* col_right = assert_cast<const ColumnString*>(argument_columns[1].get()); |
89 | 651 | const auto* col_pos = assert_cast<const ColumnInt32*>(argument_columns[2].get()); |
90 | | |
91 | 651 | ColumnInt32::MutablePtr col_res = ColumnInt32::create(); |
92 | 651 | auto& vec_res = col_res->get_data(); |
93 | 651 | vec_res.resize(block.rows()); |
94 | | |
95 | 651 | const bool is_ascii = col_left->is_ascii() && col_right->is_ascii(); |
96 | | |
97 | 651 | if (col_const[0]) { |
98 | 250 | std::visit( |
99 | 250 | [&](auto is_ascii, auto str_const, auto pos_const) { |
100 | 250 | scalar_search<is_ascii, str_const, pos_const>( |
101 | 250 | col_left->get_data_at(0), col_right, col_pos->get_data(), vec_res, |
102 | 250 | input_rows_count); |
103 | 250 | }, _ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb0EESG_SG_EEDaSA_SB_SC_ Line | Count | Source | 99 | 22 | [&](auto is_ascii, auto str_const, auto pos_const) { | 100 | 22 | scalar_search<is_ascii, str_const, pos_const>( | 101 | 22 | col_left->get_data_at(0), col_right, col_pos->get_data(), vec_res, | 102 | 22 | input_rows_count); | 103 | 22 | }, |
_ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb0EESG_SF_IbLb1EEEEDaSA_SB_SC_ Line | Count | Source | 99 | 22 | [&](auto is_ascii, auto str_const, auto pos_const) { | 100 | 22 | scalar_search<is_ascii, str_const, pos_const>( | 101 | 22 | col_left->get_data_at(0), col_right, col_pos->get_data(), vec_res, | 102 | 22 | input_rows_count); | 103 | 22 | }, |
_ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb0EESF_IbLb1EESG_EEDaSA_SB_SC_ Line | Count | Source | 99 | 22 | [&](auto is_ascii, auto str_const, auto pos_const) { | 100 | 22 | scalar_search<is_ascii, str_const, pos_const>( | 101 | 22 | col_left->get_data_at(0), col_right, col_pos->get_data(), vec_res, | 102 | 22 | input_rows_count); | 103 | 22 | }, |
Unexecuted instantiation: _ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb0EESF_IbLb1EESH_EEDaSA_SB_SC_ _ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb1EESF_IbLb0EESH_EEDaSA_SB_SC_ Line | Count | Source | 99 | 60 | [&](auto is_ascii, auto str_const, auto pos_const) { | 100 | 60 | scalar_search<is_ascii, str_const, pos_const>( | 101 | 60 | col_left->get_data_at(0), col_right, col_pos->get_data(), vec_res, | 102 | 60 | input_rows_count); | 103 | 60 | }, |
_ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb1EESF_IbLb0EESG_EEDaSA_SB_SC_ Line | Count | Source | 99 | 64 | [&](auto is_ascii, auto str_const, auto pos_const) { | 100 | 64 | scalar_search<is_ascii, str_const, pos_const>( | 101 | 64 | col_left->get_data_at(0), col_right, col_pos->get_data(), vec_res, | 102 | 64 | input_rows_count); | 103 | 64 | }, |
_ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb1EESG_SF_IbLb0EEEEDaSA_SB_SC_ Line | Count | Source | 99 | 60 | [&](auto is_ascii, auto str_const, auto pos_const) { | 100 | 60 | scalar_search<is_ascii, str_const, pos_const>( | 101 | 60 | col_left->get_data_at(0), col_right, col_pos->get_data(), vec_res, | 102 | 60 | input_rows_count); | 103 | 60 | }, |
Unexecuted instantiation: _ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb1EESG_SG_EEDaSA_SB_SC_ |
104 | 250 | make_bool_variant(is_ascii), make_bool_variant(col_const[1]), |
105 | 250 | make_bool_variant(col_const[2])); |
106 | | |
107 | 401 | } else { |
108 | 401 | std::visit( |
109 | 401 | [&](auto is_ascii, auto str_const, auto pos_const) { |
110 | 401 | vector_search<is_ascii, str_const, pos_const>(col_left, col_right, |
111 | 401 | col_pos->get_data(), vec_res, |
112 | 401 | input_rows_count); |
113 | 401 | }, _ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E0_clISt17integral_constantIbLb0EESG_SG_EEDaSA_SB_SC_ Line | Count | Source | 109 | 39 | [&](auto is_ascii, auto str_const, auto pos_const) { | 110 | 39 | vector_search<is_ascii, str_const, pos_const>(col_left, col_right, | 111 | 39 | col_pos->get_data(), vec_res, | 112 | 39 | input_rows_count); | 113 | 39 | }, |
_ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E0_clISt17integral_constantIbLb0EESG_SF_IbLb1EEEEDaSA_SB_SC_ Line | Count | Source | 109 | 22 | [&](auto is_ascii, auto str_const, auto pos_const) { | 110 | 22 | vector_search<is_ascii, str_const, pos_const>(col_left, col_right, | 111 | 22 | col_pos->get_data(), vec_res, | 112 | 22 | input_rows_count); | 113 | 22 | }, |
_ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E0_clISt17integral_constantIbLb0EESF_IbLb1EESG_EEDaSA_SB_SC_ Line | Count | Source | 109 | 22 | [&](auto is_ascii, auto str_const, auto pos_const) { | 110 | 22 | vector_search<is_ascii, str_const, pos_const>(col_left, col_right, | 111 | 22 | col_pos->get_data(), vec_res, | 112 | 22 | input_rows_count); | 113 | 22 | }, |
_ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E0_clISt17integral_constantIbLb0EESF_IbLb1EESH_EEDaSA_SB_SC_ Line | Count | Source | 109 | 22 | [&](auto is_ascii, auto str_const, auto pos_const) { | 110 | 22 | vector_search<is_ascii, str_const, pos_const>(col_left, col_right, | 111 | 22 | col_pos->get_data(), vec_res, | 112 | 22 | input_rows_count); | 113 | 22 | }, |
_ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E0_clISt17integral_constantIbLb1EESF_IbLb0EESH_EEDaSA_SB_SC_ Line | Count | Source | 109 | 116 | [&](auto is_ascii, auto str_const, auto pos_const) { | 110 | 116 | vector_search<is_ascii, str_const, pos_const>(col_left, col_right, | 111 | 116 | col_pos->get_data(), vec_res, | 112 | 116 | input_rows_count); | 113 | 116 | }, |
_ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E0_clISt17integral_constantIbLb1EESF_IbLb0EESG_EEDaSA_SB_SC_ Line | Count | Source | 109 | 60 | [&](auto is_ascii, auto str_const, auto pos_const) { | 110 | 60 | vector_search<is_ascii, str_const, pos_const>(col_left, col_right, | 111 | 60 | col_pos->get_data(), vec_res, | 112 | 60 | input_rows_count); | 113 | 60 | }, |
_ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E0_clISt17integral_constantIbLb1EESG_SF_IbLb0EEEEDaSA_SB_SC_ Line | Count | Source | 109 | 60 | [&](auto is_ascii, auto str_const, auto pos_const) { | 110 | 60 | vector_search<is_ascii, str_const, pos_const>(col_left, col_right, | 111 | 60 | col_pos->get_data(), vec_res, | 112 | 60 | input_rows_count); | 113 | 60 | }, |
_ZZNK5doris23FunctionStringLocatePos12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E0_clISt17integral_constantIbLb1EESG_SG_EEDaSA_SB_SC_ Line | Count | Source | 109 | 60 | [&](auto is_ascii, auto str_const, auto pos_const) { | 110 | 60 | vector_search<is_ascii, str_const, pos_const>(col_left, col_right, | 111 | 60 | col_pos->get_data(), vec_res, | 112 | 60 | input_rows_count); | 113 | 60 | }, |
|
114 | 401 | make_bool_variant(is_ascii), make_bool_variant(col_const[1]), |
115 | 401 | make_bool_variant(col_const[2])); |
116 | 401 | } |
117 | 651 | block.replace_by_position(result, std::move(col_res)); |
118 | 651 | return Status::OK(); |
119 | 651 | } |
120 | | |
121 | | private: |
122 | | template <bool is_ascii, bool str_const, bool pos_const> |
123 | | void scalar_search(const StringRef& ldata, const ColumnString* col_right, |
124 | | const PaddedPODArray<Int32>& posdata, PaddedPODArray<Int32>& res, |
125 | 250 | size_t size) const { |
126 | 250 | res.resize(size); |
127 | 250 | StringRef substr(ldata.data, ldata.size); |
128 | 250 | StringSearch search {&substr}; |
129 | | |
130 | 521 | for (int i = 0; i < size; ++i) { |
131 | 271 | res[i] = locate_pos<is_ascii>(substr, |
132 | 271 | col_right->get_data_at(index_check_const<str_const>(i)), |
133 | 271 | search, posdata[index_check_const<pos_const>(i)]); |
134 | 271 | } |
135 | 250 | } _ZNK5doris23FunctionStringLocatePos13scalar_searchILb0ELb0ELb0EEEvRKNS_9StringRefEPKNS_9ColumnStrIjEERKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSD_m Line | Count | Source | 125 | 22 | size_t size) const { | 126 | 22 | res.resize(size); | 127 | 22 | StringRef substr(ldata.data, ldata.size); | 128 | 22 | StringSearch search {&substr}; | 129 | | | 130 | 44 | for (int i = 0; i < size; ++i) { | 131 | 22 | res[i] = locate_pos<is_ascii>(substr, | 132 | 22 | col_right->get_data_at(index_check_const<str_const>(i)), | 133 | 22 | search, posdata[index_check_const<pos_const>(i)]); | 134 | 22 | } | 135 | 22 | } |
_ZNK5doris23FunctionStringLocatePos13scalar_searchILb0ELb0ELb1EEEvRKNS_9StringRefEPKNS_9ColumnStrIjEERKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSD_m Line | Count | Source | 125 | 22 | size_t size) const { | 126 | 22 | res.resize(size); | 127 | 22 | StringRef substr(ldata.data, ldata.size); | 128 | 22 | StringSearch search {&substr}; | 129 | | | 130 | 44 | for (int i = 0; i < size; ++i) { | 131 | 22 | res[i] = locate_pos<is_ascii>(substr, | 132 | 22 | col_right->get_data_at(index_check_const<str_const>(i)), | 133 | 22 | search, posdata[index_check_const<pos_const>(i)]); | 134 | 22 | } | 135 | 22 | } |
_ZNK5doris23FunctionStringLocatePos13scalar_searchILb0ELb1ELb0EEEvRKNS_9StringRefEPKNS_9ColumnStrIjEERKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSD_m Line | Count | Source | 125 | 22 | size_t size) const { | 126 | 22 | res.resize(size); | 127 | 22 | StringRef substr(ldata.data, ldata.size); | 128 | 22 | StringSearch search {&substr}; | 129 | | | 130 | 44 | for (int i = 0; i < size; ++i) { | 131 | 22 | res[i] = locate_pos<is_ascii>(substr, | 132 | 22 | col_right->get_data_at(index_check_const<str_const>(i)), | 133 | 22 | search, posdata[index_check_const<pos_const>(i)]); | 134 | 22 | } | 135 | 22 | } |
Unexecuted instantiation: _ZNK5doris23FunctionStringLocatePos13scalar_searchILb0ELb1ELb1EEEvRKNS_9StringRefEPKNS_9ColumnStrIjEERKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSD_m _ZNK5doris23FunctionStringLocatePos13scalar_searchILb1ELb0ELb0EEEvRKNS_9StringRefEPKNS_9ColumnStrIjEERKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSD_m Line | Count | Source | 125 | 60 | size_t size) const { | 126 | 60 | res.resize(size); | 127 | 60 | StringRef substr(ldata.data, ldata.size); | 128 | 60 | StringSearch search {&substr}; | 129 | | | 130 | 120 | for (int i = 0; i < size; ++i) { | 131 | 60 | res[i] = locate_pos<is_ascii>(substr, | 132 | 60 | col_right->get_data_at(index_check_const<str_const>(i)), | 133 | 60 | search, posdata[index_check_const<pos_const>(i)]); | 134 | 60 | } | 135 | 60 | } |
_ZNK5doris23FunctionStringLocatePos13scalar_searchILb1ELb0ELb1EEEvRKNS_9StringRefEPKNS_9ColumnStrIjEERKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSD_m Line | Count | Source | 125 | 64 | size_t size) const { | 126 | 64 | res.resize(size); | 127 | 64 | StringRef substr(ldata.data, ldata.size); | 128 | 64 | StringSearch search {&substr}; | 129 | | | 130 | 149 | for (int i = 0; i < size; ++i) { | 131 | 85 | res[i] = locate_pos<is_ascii>(substr, | 132 | 85 | col_right->get_data_at(index_check_const<str_const>(i)), | 133 | 85 | search, posdata[index_check_const<pos_const>(i)]); | 134 | 85 | } | 135 | 64 | } |
_ZNK5doris23FunctionStringLocatePos13scalar_searchILb1ELb1ELb0EEEvRKNS_9StringRefEPKNS_9ColumnStrIjEERKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSD_m Line | Count | Source | 125 | 60 | size_t size) const { | 126 | 60 | res.resize(size); | 127 | 60 | StringRef substr(ldata.data, ldata.size); | 128 | 60 | StringSearch search {&substr}; | 129 | | | 130 | 120 | for (int i = 0; i < size; ++i) { | 131 | 60 | res[i] = locate_pos<is_ascii>(substr, | 132 | 60 | col_right->get_data_at(index_check_const<str_const>(i)), | 133 | 60 | search, posdata[index_check_const<pos_const>(i)]); | 134 | 60 | } | 135 | 60 | } |
Unexecuted instantiation: _ZNK5doris23FunctionStringLocatePos13scalar_searchILb1ELb1ELb1EEEvRKNS_9StringRefEPKNS_9ColumnStrIjEERKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSD_m |
136 | | |
137 | | template <bool is_ascii, bool str_const, bool pos_const> |
138 | | void vector_search(const ColumnString* col_left, const ColumnString* col_right, |
139 | | const PaddedPODArray<Int32>& posdata, PaddedPODArray<Int32>& res, |
140 | 401 | size_t size) const { |
141 | 401 | res.resize(size); |
142 | 401 | StringSearch search; |
143 | 984 | for (int i = 0; i < size; ++i) { |
144 | 583 | StringRef substr = col_left->get_data_at(i); |
145 | 583 | search.set_pattern(&substr); |
146 | 583 | res[i] = locate_pos<is_ascii>(substr, |
147 | 583 | col_right->get_data_at(index_check_const<str_const>(i)), |
148 | 583 | search, posdata[index_check_const<pos_const>(i)]); |
149 | 583 | } |
150 | 401 | } _ZNK5doris23FunctionStringLocatePos13vector_searchILb0ELb0ELb0EEEvPKNS_9ColumnStrIjEES5_RKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSA_m Line | Count | Source | 140 | 39 | size_t size) const { | 141 | 39 | res.resize(size); | 142 | 39 | StringSearch search; | 143 | 103 | for (int i = 0; i < size; ++i) { | 144 | 64 | StringRef substr = col_left->get_data_at(i); | 145 | 64 | search.set_pattern(&substr); | 146 | 64 | res[i] = locate_pos<is_ascii>(substr, | 147 | 64 | col_right->get_data_at(index_check_const<str_const>(i)), | 148 | 64 | search, posdata[index_check_const<pos_const>(i)]); | 149 | 64 | } | 150 | 39 | } |
_ZNK5doris23FunctionStringLocatePos13vector_searchILb0ELb0ELb1EEEvPKNS_9ColumnStrIjEES5_RKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSA_m Line | Count | Source | 140 | 22 | size_t size) const { | 141 | 22 | res.resize(size); | 142 | 22 | StringSearch search; | 143 | 44 | for (int i = 0; i < size; ++i) { | 144 | 22 | StringRef substr = col_left->get_data_at(i); | 145 | 22 | search.set_pattern(&substr); | 146 | 22 | res[i] = locate_pos<is_ascii>(substr, | 147 | 22 | col_right->get_data_at(index_check_const<str_const>(i)), | 148 | 22 | search, posdata[index_check_const<pos_const>(i)]); | 149 | 22 | } | 150 | 22 | } |
_ZNK5doris23FunctionStringLocatePos13vector_searchILb0ELb1ELb0EEEvPKNS_9ColumnStrIjEES5_RKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSA_m Line | Count | Source | 140 | 22 | size_t size) const { | 141 | 22 | res.resize(size); | 142 | 22 | StringSearch search; | 143 | 44 | for (int i = 0; i < size; ++i) { | 144 | 22 | StringRef substr = col_left->get_data_at(i); | 145 | 22 | search.set_pattern(&substr); | 146 | 22 | res[i] = locate_pos<is_ascii>(substr, | 147 | 22 | col_right->get_data_at(index_check_const<str_const>(i)), | 148 | 22 | search, posdata[index_check_const<pos_const>(i)]); | 149 | 22 | } | 150 | 22 | } |
_ZNK5doris23FunctionStringLocatePos13vector_searchILb0ELb1ELb1EEEvPKNS_9ColumnStrIjEES5_RKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSA_m Line | Count | Source | 140 | 22 | size_t size) const { | 141 | 22 | res.resize(size); | 142 | 22 | StringSearch search; | 143 | 44 | for (int i = 0; i < size; ++i) { | 144 | 22 | StringRef substr = col_left->get_data_at(i); | 145 | 22 | search.set_pattern(&substr); | 146 | 22 | res[i] = locate_pos<is_ascii>(substr, | 147 | 22 | col_right->get_data_at(index_check_const<str_const>(i)), | 148 | 22 | search, posdata[index_check_const<pos_const>(i)]); | 149 | 22 | } | 150 | 22 | } |
_ZNK5doris23FunctionStringLocatePos13vector_searchILb1ELb0ELb0EEEvPKNS_9ColumnStrIjEES5_RKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSA_m Line | Count | Source | 140 | 116 | size_t size) const { | 141 | 116 | res.resize(size); | 142 | 116 | StringSearch search; | 143 | 389 | for (int i = 0; i < size; ++i) { | 144 | 273 | StringRef substr = col_left->get_data_at(i); | 145 | 273 | search.set_pattern(&substr); | 146 | 273 | res[i] = locate_pos<is_ascii>(substr, | 147 | 273 | col_right->get_data_at(index_check_const<str_const>(i)), | 148 | 273 | search, posdata[index_check_const<pos_const>(i)]); | 149 | 273 | } | 150 | 116 | } |
_ZNK5doris23FunctionStringLocatePos13vector_searchILb1ELb0ELb1EEEvPKNS_9ColumnStrIjEES5_RKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSA_m Line | Count | Source | 140 | 60 | size_t size) const { | 141 | 60 | res.resize(size); | 142 | 60 | StringSearch search; | 143 | 120 | for (int i = 0; i < size; ++i) { | 144 | 60 | StringRef substr = col_left->get_data_at(i); | 145 | 60 | search.set_pattern(&substr); | 146 | 60 | res[i] = locate_pos<is_ascii>(substr, | 147 | 60 | col_right->get_data_at(index_check_const<str_const>(i)), | 148 | 60 | search, posdata[index_check_const<pos_const>(i)]); | 149 | 60 | } | 150 | 60 | } |
_ZNK5doris23FunctionStringLocatePos13vector_searchILb1ELb1ELb0EEEvPKNS_9ColumnStrIjEES5_RKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSA_m Line | Count | Source | 140 | 60 | size_t size) const { | 141 | 60 | res.resize(size); | 142 | 60 | StringSearch search; | 143 | 120 | for (int i = 0; i < size; ++i) { | 144 | 60 | StringRef substr = col_left->get_data_at(i); | 145 | 60 | search.set_pattern(&substr); | 146 | 60 | res[i] = locate_pos<is_ascii>(substr, | 147 | 60 | col_right->get_data_at(index_check_const<str_const>(i)), | 148 | 60 | search, posdata[index_check_const<pos_const>(i)]); | 149 | 60 | } | 150 | 60 | } |
_ZNK5doris23FunctionStringLocatePos13vector_searchILb1ELb1ELb1EEEvPKNS_9ColumnStrIjEES5_RKNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERSA_m Line | Count | Source | 140 | 60 | size_t size) const { | 141 | 60 | res.resize(size); | 142 | 60 | StringSearch search; | 143 | 120 | for (int i = 0; i < size; ++i) { | 144 | 60 | StringRef substr = col_left->get_data_at(i); | 145 | 60 | search.set_pattern(&substr); | 146 | 60 | res[i] = locate_pos<is_ascii>(substr, | 147 | 60 | col_right->get_data_at(index_check_const<str_const>(i)), | 148 | 60 | search, posdata[index_check_const<pos_const>(i)]); | 149 | 60 | } | 150 | 60 | } |
|
151 | | |
152 | | template <bool is_ascii> |
153 | 854 | int locate_pos(StringRef substr, StringRef str, StringSearch& search, int start_pos) const { |
154 | 854 | if (str.size == 0 && substr.size == 0 && start_pos == 1) { |
155 | | // BEHAVIOR COMPATIBLE WITH MYSQL |
156 | | // locate('','') locate('','',1) locate('','',2) |
157 | | // 1 1 0 |
158 | 13 | return 1; |
159 | 13 | } |
160 | 841 | if (is_ascii) { |
161 | 645 | return locate_pos_ascii(substr, str, search, start_pos); |
162 | 645 | } else { |
163 | 196 | return locate_pos_utf8(substr, str, search, start_pos); |
164 | 196 | } |
165 | 841 | } _ZNK5doris23FunctionStringLocatePos10locate_posILb0EEEiNS_9StringRefES2_RNS_12StringSearchEi Line | Count | Source | 153 | 196 | int locate_pos(StringRef substr, StringRef str, StringSearch& search, int start_pos) const { | 154 | 196 | if (str.size == 0 && substr.size == 0 && start_pos == 1) { | 155 | | // BEHAVIOR COMPATIBLE WITH MYSQL | 156 | | // locate('','') locate('','',1) locate('','',2) | 157 | | // 1 1 0 | 158 | 0 | return 1; | 159 | 0 | } | 160 | 196 | if (is_ascii) { | 161 | 0 | return locate_pos_ascii(substr, str, search, start_pos); | 162 | 196 | } else { | 163 | 196 | return locate_pos_utf8(substr, str, search, start_pos); | 164 | 196 | } | 165 | 196 | } |
_ZNK5doris23FunctionStringLocatePos10locate_posILb1EEEiNS_9StringRefES2_RNS_12StringSearchEi Line | Count | Source | 153 | 658 | int locate_pos(StringRef substr, StringRef str, StringSearch& search, int start_pos) const { | 154 | 658 | if (str.size == 0 && substr.size == 0 && start_pos == 1) { | 155 | | // BEHAVIOR COMPATIBLE WITH MYSQL | 156 | | // locate('','') locate('','',1) locate('','',2) | 157 | | // 1 1 0 | 158 | 13 | return 1; | 159 | 13 | } | 160 | 645 | if (is_ascii) { | 161 | 645 | return locate_pos_ascii(substr, str, search, start_pos); | 162 | 645 | } else { | 163 | 0 | return locate_pos_utf8(substr, str, search, start_pos); | 164 | 0 | } | 165 | 645 | } |
|
166 | | |
167 | | int locate_pos_utf8(StringRef substr, StringRef str, StringSearch& search, |
168 | 196 | int start_pos) const { |
169 | 196 | std::vector<size_t> index; |
170 | 196 | size_t char_len = simd::VStringFunctions::get_char_len(str.data, str.size, index); |
171 | 196 | if (start_pos <= 0 || start_pos > char_len) { |
172 | 49 | return 0; |
173 | 49 | } |
174 | 147 | if (substr.size == 0) { |
175 | 18 | return start_pos; |
176 | 18 | } |
177 | | // Input start_pos starts from 1. |
178 | 129 | StringRef adjusted_str(str.data + index[start_pos - 1], str.size - index[start_pos - 1]); |
179 | 129 | int32_t match_pos = search.search(&adjusted_str); |
180 | 129 | if (match_pos >= 0) { |
181 | | // Hive returns the position in the original string starting from 1. |
182 | 111 | return start_pos + simd::VStringFunctions::get_char_len(adjusted_str.data, match_pos); |
183 | 111 | } else { |
184 | 18 | return 0; |
185 | 18 | } |
186 | 129 | } |
187 | | |
188 | | int locate_pos_ascii(StringRef substr, StringRef str, StringSearch& search, |
189 | 645 | int start_pos) const { |
190 | 645 | if (start_pos <= 0 || start_pos > str.size) { |
191 | 412 | return 0; |
192 | 412 | } |
193 | 233 | if (substr.size == 0) { |
194 | 38 | return start_pos; |
195 | 38 | } |
196 | | // Input start_pos starts from 1. |
197 | 195 | StringRef adjusted_str(str.data + start_pos - 1, str.size - start_pos + 1); |
198 | 195 | int32_t match_pos = search.search(&adjusted_str); |
199 | 195 | if (match_pos >= 0) { |
200 | | // Hive returns the position in the original string starting from 1. |
201 | 62 | return start_pos + match_pos; |
202 | 133 | } else { |
203 | 133 | return 0; |
204 | 133 | } |
205 | 195 | } |
206 | | }; |
207 | | |
208 | | class FunctionSplitPart : public IFunction { |
209 | | public: |
210 | | static constexpr auto name = "split_part"; |
211 | 145 | static FunctionPtr create() { return std::make_shared<FunctionSplitPart>(); } |
212 | 1 | String get_name() const override { return name; } |
213 | 136 | size_t get_number_of_arguments() const override { return 3; } |
214 | | |
215 | 136 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
216 | 136 | return make_nullable(std::make_shared<DataTypeString>()); |
217 | 136 | } |
218 | | |
219 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
220 | 153 | uint32_t result, size_t input_rows_count) const override { |
221 | 153 | DCHECK_EQ(arguments.size(), 3); |
222 | | |
223 | 153 | auto null_map = ColumnUInt8::create(input_rows_count, 0); |
224 | | // Create a zero column to simply implement |
225 | 153 | auto const_null_map = ColumnUInt8::create(input_rows_count, 0); |
226 | 153 | auto res = ColumnString::create(); |
227 | | |
228 | 153 | auto& null_map_data = null_map->get_data(); |
229 | 153 | auto& res_offsets = res->get_offsets(); |
230 | 153 | auto& res_chars = res->get_chars(); |
231 | 153 | res_offsets.resize(input_rows_count); |
232 | | |
233 | 153 | const size_t argument_size = arguments.size(); |
234 | 153 | std::vector<ColumnPtr> argument_columns(argument_size); |
235 | 612 | for (size_t i = 0; i < argument_size; ++i) { |
236 | 459 | argument_columns[i] = |
237 | 459 | block.get_by_position(arguments[i]).column->convert_to_full_column_if_const(); |
238 | 459 | if (const auto* nullable = |
239 | 459 | check_and_get_column<const ColumnNullable>(*argument_columns[i])) { |
240 | | // Danger: Here must dispose the null map data first! Because |
241 | | // argument_columns[i]=nullable->get_nested_column_ptr(); will release the mem |
242 | | // of column nullable mem of null map |
243 | 0 | VectorizedUtils::update_null_map(null_map->get_data(), |
244 | 0 | nullable->get_null_map_data()); |
245 | 0 | argument_columns[i] = nullable->get_nested_column_ptr(); |
246 | 0 | } |
247 | 459 | } |
248 | | |
249 | 153 | const auto* str_col = assert_cast<const ColumnString*>(argument_columns[0].get()); |
250 | | |
251 | 153 | const auto* delimiter_col = assert_cast<const ColumnString*>(argument_columns[1].get()); |
252 | | |
253 | 153 | const auto* part_num_col = assert_cast<const ColumnInt32*>(argument_columns[2].get()); |
254 | 153 | const auto& part_num_col_data = part_num_col->get_data(); |
255 | | |
256 | 400 | for (size_t i = 0; i < input_rows_count; ++i) { |
257 | 247 | if (part_num_col_data[i] == 0) { |
258 | 11 | StringOP::push_null_string(i, res_chars, res_offsets, null_map_data); |
259 | 11 | continue; |
260 | 11 | } |
261 | | |
262 | 236 | auto delimiter = delimiter_col->get_data_at(i); |
263 | 236 | auto delimiter_str = delimiter_col->get_data_at(i).to_string(); |
264 | 236 | auto part_number = part_num_col_data[i]; |
265 | 236 | auto str = str_col->get_data_at(i); |
266 | 236 | if (delimiter.size == 0) { |
267 | 9 | StringOP::push_empty_string(i, res_chars, res_offsets); |
268 | 9 | continue; |
269 | 9 | } |
270 | | |
271 | 227 | if (part_number > 0) { |
272 | 188 | if (delimiter.size == 1) { |
273 | | // If delimiter is a char, use memchr to split |
274 | 155 | int32_t pre_offset = -1; |
275 | 155 | int32_t offset = -1; |
276 | 155 | int32_t num = 0; |
277 | 261 | while (num < part_number) { |
278 | 217 | pre_offset = offset; |
279 | 217 | size_t n = str.size - offset - 1; |
280 | 217 | const char* pos = reinterpret_cast<const char*>( |
281 | 217 | memchr(str.data + offset + 1, delimiter_str[0], n)); |
282 | 217 | if (pos != nullptr) { |
283 | 106 | offset = pos - str.data; |
284 | 106 | num++; |
285 | 111 | } else { |
286 | 111 | offset = str.size; |
287 | 111 | num = (num == 0) ? 0 : num + 1; |
288 | 111 | break; |
289 | 111 | } |
290 | 217 | } |
291 | | |
292 | 155 | if (num == part_number) { |
293 | 71 | StringOP::push_value_string( |
294 | 71 | std::string_view { |
295 | 71 | reinterpret_cast<const char*>(str.data + pre_offset + 1), |
296 | 71 | (size_t)offset - pre_offset - 1}, |
297 | 71 | i, res_chars, res_offsets); |
298 | 84 | } else { |
299 | 84 | StringOP::push_null_string(i, res_chars, res_offsets, null_map_data); |
300 | 84 | } |
301 | 155 | } else { |
302 | | // If delimiter is a string, use memmem to split |
303 | 33 | int32_t pre_offset = -delimiter.size; |
304 | 33 | int32_t offset = -delimiter.size; |
305 | 33 | int32_t num = 0; |
306 | 68 | while (num < part_number) { |
307 | 54 | pre_offset = offset; |
308 | 54 | size_t n = str.size - offset - delimiter.size; |
309 | 54 | char* pos = |
310 | 54 | reinterpret_cast<char*>(memmem(str.data + offset + delimiter.size, |
311 | 54 | n, delimiter.data, delimiter.size)); |
312 | 54 | if (pos != nullptr) { |
313 | 35 | offset = pos - str.data; |
314 | 35 | num++; |
315 | 35 | } else { |
316 | 19 | offset = str.size; |
317 | 19 | num = (num == 0) ? 0 : num + 1; |
318 | 19 | break; |
319 | 19 | } |
320 | 54 | } |
321 | | |
322 | 33 | if (num == part_number) { |
323 | 27 | StringOP::push_value_string( |
324 | 27 | std::string_view {reinterpret_cast<const char*>( |
325 | 27 | str.data + pre_offset + delimiter.size), |
326 | 27 | (size_t)offset - pre_offset - delimiter.size}, |
327 | 27 | i, res_chars, res_offsets); |
328 | 27 | } else { |
329 | 6 | StringOP::push_null_string(i, res_chars, res_offsets, null_map_data); |
330 | 6 | } |
331 | 33 | } |
332 | 188 | } else { |
333 | 39 | part_number = -part_number; |
334 | 39 | auto str_str = str.to_string(); |
335 | 39 | int32_t offset = str.size; |
336 | 39 | int32_t pre_offset = offset; |
337 | 39 | int32_t num = 0; |
338 | 39 | auto substr = str_str; |
339 | 83 | while (num <= part_number && offset >= 0) { |
340 | 83 | offset = (int)substr.rfind(delimiter, offset); |
341 | 83 | if (offset != -1) { |
342 | 62 | if (++num == part_number) { |
343 | 18 | break; |
344 | 18 | } |
345 | 44 | pre_offset = offset; |
346 | 44 | offset = offset - 1; |
347 | 44 | substr = str_str.substr(0, pre_offset); |
348 | 44 | } else { |
349 | 21 | break; |
350 | 21 | } |
351 | 83 | } |
352 | 39 | num = (offset == -1 && num != 0) ? num + 1 : num; |
353 | | |
354 | 39 | if (num == part_number) { |
355 | 24 | if (offset == -1) { |
356 | 6 | StringOP::push_value_string( |
357 | 6 | std::string_view {reinterpret_cast<const char*>(str.data), |
358 | 6 | (size_t)pre_offset}, |
359 | 6 | i, res_chars, res_offsets); |
360 | 18 | } else { |
361 | 18 | StringOP::push_value_string( |
362 | 18 | std::string_view {str_str.substr( |
363 | 18 | offset + delimiter.size, |
364 | 18 | (size_t)pre_offset - offset - delimiter.size)}, |
365 | 18 | i, res_chars, res_offsets); |
366 | 18 | } |
367 | 24 | } else { |
368 | 15 | StringOP::push_null_string(i, res_chars, res_offsets, null_map_data); |
369 | 15 | } |
370 | 39 | } |
371 | 227 | } |
372 | | |
373 | 153 | block.get_by_position(result).column = |
374 | 153 | ColumnNullable::create(std::move(res), std::move(null_map)); |
375 | 153 | return Status::OK(); |
376 | 153 | } |
377 | | }; |
378 | | |
379 | | class FunctionSubstringIndex : public IFunction { |
380 | | public: |
381 | | static constexpr auto name = "substring_index"; |
382 | 112 | static FunctionPtr create() { return std::make_shared<FunctionSubstringIndex>(); } |
383 | 1 | String get_name() const override { return name; } |
384 | 103 | size_t get_number_of_arguments() const override { return 3; } |
385 | | |
386 | 103 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
387 | 103 | return std::make_shared<DataTypeString>(); |
388 | 103 | } |
389 | | |
390 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
391 | 107 | uint32_t result, size_t input_rows_count) const override { |
392 | 107 | DCHECK_EQ(arguments.size(), 3); |
393 | | |
394 | | // Create a zero column to simply implement |
395 | 107 | auto res = ColumnString::create(); |
396 | | |
397 | 107 | auto& res_offsets = res->get_offsets(); |
398 | 107 | auto& res_chars = res->get_chars(); |
399 | 107 | res_offsets.resize(input_rows_count); |
400 | 107 | ColumnPtr content_column; |
401 | 107 | bool content_const = false; |
402 | 107 | std::tie(content_column, content_const) = |
403 | 107 | unpack_if_const(block.get_by_position(arguments[0]).column); |
404 | | |
405 | 107 | const auto* str_col = assert_cast<const ColumnString*>(content_column.get()); |
406 | | |
407 | | // Handle both constant and non-constant delimiter parameters |
408 | 107 | ColumnPtr delimiter_column_ptr; |
409 | 107 | bool delimiter_const = false; |
410 | 107 | std::tie(delimiter_column_ptr, delimiter_const) = |
411 | 107 | unpack_if_const(block.get_by_position(arguments[1]).column); |
412 | 107 | const auto* delimiter_col = assert_cast<const ColumnString*>(delimiter_column_ptr.get()); |
413 | | |
414 | 107 | ColumnPtr part_num_column_ptr; |
415 | 107 | bool part_num_const = false; |
416 | 107 | std::tie(part_num_column_ptr, part_num_const) = |
417 | 107 | unpack_if_const(block.get_by_position(arguments[2]).column); |
418 | 107 | const ColumnInt32* part_num_col = |
419 | 107 | assert_cast<const ColumnInt32*>(part_num_column_ptr.get()); |
420 | | |
421 | | // For constant multi-character delimiters, create StringRef and StringSearch only once |
422 | 107 | std::optional<StringRef> const_delimiter_ref; |
423 | 107 | std::optional<StringSearch> const_search; |
424 | 107 | if (delimiter_const && delimiter_col->get_data_at(0).size > 1) { |
425 | 0 | const_delimiter_ref.emplace(delimiter_col->get_data_at(0)); |
426 | 0 | const_search.emplace(&const_delimiter_ref.value()); |
427 | 0 | } |
428 | | |
429 | 297 | for (size_t i = 0; i < input_rows_count; ++i) { |
430 | 190 | auto str = str_col->get_data_at(content_const ? 0 : i); |
431 | 190 | auto delimiter = delimiter_col->get_data_at(delimiter_const ? 0 : i); |
432 | 190 | int32_t delimiter_size = delimiter.size; |
433 | | |
434 | 190 | auto part_number = part_num_col->get_element(part_num_const ? 0 : i); |
435 | | |
436 | 190 | if (part_number == 0 || delimiter_size == 0) { |
437 | 7 | StringOP::push_empty_string(i, res_chars, res_offsets); |
438 | 7 | continue; |
439 | 7 | } |
440 | | |
441 | 183 | if (part_number > 0) { |
442 | 128 | if (delimiter_size == 1) { |
443 | 85 | int32_t offset = -1; |
444 | 85 | int32_t num = 0; |
445 | 137 | while (num < part_number) { |
446 | 117 | size_t n = str.size - offset - 1; |
447 | 117 | const char* pos = reinterpret_cast<const char*>( |
448 | 117 | memchr(str.data + offset + 1, delimiter.data[0], n)); |
449 | 117 | if (pos != nullptr) { |
450 | 52 | offset = pos - str.data; |
451 | 52 | num++; |
452 | 65 | } else { |
453 | 65 | offset = str.size; |
454 | 65 | num = (num == 0) ? 0 : num + 1; |
455 | 65 | break; |
456 | 65 | } |
457 | 117 | } |
458 | | |
459 | 85 | if (num == part_number) { |
460 | 25 | StringOP::push_value_string( |
461 | 25 | std::string_view {reinterpret_cast<const char*>(str.data), |
462 | 25 | (size_t)offset}, |
463 | 25 | i, res_chars, res_offsets); |
464 | 60 | } else { |
465 | 60 | StringOP::push_value_string(std::string_view(str.data, str.size), i, |
466 | 60 | res_chars, res_offsets); |
467 | 60 | } |
468 | 85 | } else { |
469 | | // For multi-character delimiters |
470 | | // Use pre-created StringRef and StringSearch for constant delimiters |
471 | 43 | StringRef delimiter_ref = const_delimiter_ref ? const_delimiter_ref.value() |
472 | 43 | : StringRef(delimiter); |
473 | 43 | const StringSearch* search_ptr = const_search ? &const_search.value() : nullptr; |
474 | 43 | StringSearch local_search(&delimiter_ref); |
475 | 43 | if (!search_ptr) { |
476 | 43 | search_ptr = &local_search; |
477 | 43 | } |
478 | | |
479 | 43 | int32_t offset = -delimiter_size; |
480 | 43 | int32_t num = 0; |
481 | 86 | while (num < part_number) { |
482 | 59 | size_t n = str.size - offset - delimiter_size; |
483 | | // search first match delimter_ref index from src string among str_offset to end |
484 | 59 | const char* pos = search_ptr->search(str.data + offset + delimiter_size, n); |
485 | 59 | if (pos < str.data + str.size) { |
486 | 43 | offset = pos - str.data; |
487 | 43 | num++; |
488 | 43 | } else { |
489 | 16 | offset = str.size; |
490 | 16 | num = (num == 0) ? 0 : num + 1; |
491 | 16 | break; |
492 | 16 | } |
493 | 59 | } |
494 | | |
495 | 43 | if (num == part_number) { |
496 | 40 | StringOP::push_value_string( |
497 | 40 | std::string_view {reinterpret_cast<const char*>(str.data), |
498 | 40 | (size_t)offset}, |
499 | 40 | i, res_chars, res_offsets); |
500 | 40 | } else { |
501 | 3 | StringOP::push_value_string(std::string_view(str.data, str.size), i, |
502 | 3 | res_chars, res_offsets); |
503 | 3 | } |
504 | 43 | } |
505 | 128 | } else { |
506 | 55 | int neg_part_number = -part_number; |
507 | 55 | auto str_str = str.to_string(); |
508 | 55 | int32_t offset = str.size; |
509 | 55 | int32_t pre_offset = offset; |
510 | 55 | int32_t num = 0; |
511 | 55 | auto substr = str_str; |
512 | | |
513 | | // Use pre-created StringRef for constant delimiters |
514 | 55 | StringRef delimiter_str = |
515 | 55 | const_delimiter_ref |
516 | 55 | ? const_delimiter_ref.value() |
517 | 55 | : StringRef(reinterpret_cast<const char*>(delimiter.data), |
518 | 55 | delimiter.size); |
519 | | |
520 | 79 | while (num <= neg_part_number && offset >= 0) { |
521 | 79 | offset = (int)substr.rfind(delimiter_str, offset); |
522 | 79 | if (offset != -1) { |
523 | 63 | if (++num == neg_part_number) { |
524 | 39 | break; |
525 | 39 | } |
526 | 24 | pre_offset = offset; |
527 | 24 | offset = offset - 1; |
528 | 24 | substr = str_str.substr(0, pre_offset); |
529 | 24 | } else { |
530 | 16 | break; |
531 | 16 | } |
532 | 79 | } |
533 | 55 | num = (offset == -1 && num != 0) ? num + 1 : num; |
534 | | |
535 | 55 | if (num == neg_part_number) { |
536 | 43 | if (offset == -1) { |
537 | 4 | StringOP::push_value_string(std::string_view(str.data, str.size), i, |
538 | 4 | res_chars, res_offsets); |
539 | 39 | } else { |
540 | 39 | StringOP::push_value_string( |
541 | 39 | std::string_view {str.data + offset + delimiter_size, |
542 | 39 | str.size - offset - delimiter_size}, |
543 | 39 | i, res_chars, res_offsets); |
544 | 39 | } |
545 | 43 | } else { |
546 | 12 | StringOP::push_value_string(std::string_view(str.data, str.size), i, res_chars, |
547 | 12 | res_offsets); |
548 | 12 | } |
549 | 55 | } |
550 | 183 | } |
551 | | |
552 | 107 | block.get_by_position(result).column = std::move(res); |
553 | 107 | return Status::OK(); |
554 | 107 | } |
555 | | }; |
556 | | |
557 | | class FunctionSplitByString : public IFunction { |
558 | | public: |
559 | | static constexpr auto name = "split_by_string"; |
560 | | |
561 | 135 | static FunctionPtr create() { return std::make_shared<FunctionSplitByString>(); } |
562 | | using NullMapType = PaddedPODArray<UInt8>; |
563 | | |
564 | 1 | String get_name() const override { return name; } |
565 | | |
566 | 127 | bool is_variadic() const override { return false; } |
567 | | |
568 | 126 | size_t get_number_of_arguments() const override { return 2; } |
569 | | |
570 | 126 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
571 | 126 | DCHECK(is_string_type(arguments[0]->get_primitive_type())) |
572 | 0 | << "first argument for function: " << name << " should be string" |
573 | 0 | << " and arguments[0] is " << arguments[0]->get_name(); |
574 | 126 | DCHECK(is_string_type(arguments[1]->get_primitive_type())) |
575 | 0 | << "second argument for function: " << name << " should be string" |
576 | 0 | << " and arguments[1] is " << arguments[1]->get_name(); |
577 | 126 | return std::make_shared<DataTypeArray>(make_nullable(arguments[0])); |
578 | 126 | } |
579 | | |
580 | | Status execute_impl(FunctionContext* /*context*/, Block& block, const ColumnNumbers& arguments, |
581 | 157 | uint32_t result, size_t input_rows_count) const override { |
582 | 157 | DCHECK_EQ(arguments.size(), 2); |
583 | | |
584 | 157 | const auto& [src_column, left_const] = |
585 | 157 | unpack_if_const(block.get_by_position(arguments[0]).column); |
586 | 157 | const auto& [right_column, right_const] = |
587 | 157 | unpack_if_const(block.get_by_position(arguments[1]).column); |
588 | | |
589 | 157 | DataTypePtr right_column_type = block.get_by_position(arguments[1]).type; |
590 | 157 | DataTypePtr src_column_type = block.get_by_position(arguments[0]).type; |
591 | 157 | auto dest_column_ptr = ColumnArray::create(make_nullable(src_column_type)->create_column(), |
592 | 157 | ColumnArray::ColumnOffsets::create()); |
593 | | |
594 | 157 | dest_column_ptr->resize(0); |
595 | 157 | auto& dest_offsets = dest_column_ptr->get_offsets(); |
596 | | |
597 | 157 | auto& dest_nullable_col = assert_cast<ColumnNullable&>(dest_column_ptr->get_data()); |
598 | 157 | auto* dest_nested_column = dest_nullable_col.get_nested_column_ptr().get(); |
599 | | |
600 | 157 | const auto* col_str = assert_cast<const ColumnString*>(src_column.get()); |
601 | | |
602 | 157 | const auto* col_delimiter = assert_cast<const ColumnString*>(right_column.get()); |
603 | | |
604 | 157 | std::visit( |
605 | 157 | [&](auto src_const, auto delimiter_const) { |
606 | 157 | _execute<src_const, delimiter_const>(*col_str, *col_delimiter, |
607 | 157 | *dest_nested_column, dest_offsets, |
608 | 157 | input_rows_count); |
609 | 157 | }, _ZZNK5doris21FunctionSplitByString12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_E_clISt17integral_constantIbLb0EESF_EEDaSA_SB_ Line | Count | Source | 605 | 55 | [&](auto src_const, auto delimiter_const) { | 606 | 55 | _execute<src_const, delimiter_const>(*col_str, *col_delimiter, | 607 | 55 | *dest_nested_column, dest_offsets, | 608 | 55 | input_rows_count); | 609 | 55 | }, |
_ZZNK5doris21FunctionSplitByString12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_E_clISt17integral_constantIbLb0EESE_IbLb1EEEEDaSA_SB_ Line | Count | Source | 605 | 94 | [&](auto src_const, auto delimiter_const) { | 606 | 94 | _execute<src_const, delimiter_const>(*col_str, *col_delimiter, | 607 | 94 | *dest_nested_column, dest_offsets, | 608 | 94 | input_rows_count); | 609 | 94 | }, |
_ZZNK5doris21FunctionSplitByString12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_E_clISt17integral_constantIbLb1EESE_IbLb0EEEEDaSA_SB_ Line | Count | Source | 605 | 8 | [&](auto src_const, auto delimiter_const) { | 606 | 8 | _execute<src_const, delimiter_const>(*col_str, *col_delimiter, | 607 | 8 | *dest_nested_column, dest_offsets, | 608 | 8 | input_rows_count); | 609 | 8 | }, |
Unexecuted instantiation: _ZZNK5doris21FunctionSplitByString12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_E_clISt17integral_constantIbLb1EESF_EEDaSA_SB_ |
610 | 157 | make_bool_variant(left_const), make_bool_variant(right_const)); |
611 | | |
612 | | // all elements in dest_nested_column are not null |
613 | 157 | dest_nullable_col.get_null_map_column().get_data().resize_fill(dest_nested_column->size(), |
614 | 157 | false); |
615 | 157 | block.replace_by_position(result, std::move(dest_column_ptr)); |
616 | | |
617 | 157 | return Status::OK(); |
618 | 157 | } |
619 | | |
620 | | private: |
621 | | template <bool src_const, bool delimiter_const> |
622 | | void _execute(const ColumnString& src_column_string, const ColumnString& delimiter_column, |
623 | | IColumn& dest_nested_column, ColumnArray::Offsets64& dest_offsets, |
624 | 157 | size_t size) const { |
625 | 157 | auto& dest_column_string = assert_cast<ColumnString&>(dest_nested_column); |
626 | 157 | ColumnString::Chars& column_string_chars = dest_column_string.get_chars(); |
627 | 157 | ColumnString::Offsets& column_string_offsets = dest_column_string.get_offsets(); |
628 | 157 | column_string_chars.reserve(0); |
629 | | |
630 | 157 | ColumnArray::Offset64 string_pos = 0; |
631 | 157 | ColumnArray::Offset64 dest_pos = 0; |
632 | | |
633 | 157 | StringSearch search; |
634 | 157 | StringRef delimiter_ref_for_search; |
635 | | |
636 | 157 | if constexpr (delimiter_const) { |
637 | 94 | delimiter_ref_for_search = delimiter_column.get_data_at(0); |
638 | 94 | search.set_pattern(&delimiter_ref_for_search); |
639 | 94 | } |
640 | | |
641 | 1.24k | for (size_t i = 0; i < size; i++) { |
642 | 1.08k | const StringRef str_ref = |
643 | 1.08k | src_column_string.get_data_at(index_check_const<src_const>(i)); |
644 | 1.08k | const StringRef delimiter_ref = |
645 | 1.08k | delimiter_column.get_data_at(index_check_const<delimiter_const>(i)); |
646 | | |
647 | 1.08k | if (str_ref.size == 0) { |
648 | 164 | dest_offsets.push_back(dest_pos); |
649 | 164 | continue; |
650 | 164 | } |
651 | 919 | if (delimiter_ref.size == 0) { |
652 | 27 | split_empty_delimiter(str_ref, column_string_chars, column_string_offsets, |
653 | 27 | string_pos, dest_pos); |
654 | 892 | } else { |
655 | 892 | if constexpr (!delimiter_const) { |
656 | 51 | search.set_pattern(&delimiter_ref); |
657 | 51 | } |
658 | 51.7k | for (size_t str_pos = 0; str_pos <= str_ref.size;) { |
659 | 50.8k | const size_t str_offset = str_pos; |
660 | 50.8k | const size_t old_size = column_string_chars.size(); |
661 | | // search first match delimter_ref index from src string among str_offset to end |
662 | 50.8k | const char* result_start = |
663 | 50.8k | search.search(str_ref.data + str_offset, str_ref.size - str_offset); |
664 | | // compute split part size |
665 | 50.8k | const size_t split_part_size = result_start - str_ref.data - str_offset; |
666 | | // save dist string split part |
667 | 50.8k | if (split_part_size > 0) { |
668 | 50.2k | const size_t new_size = old_size + split_part_size; |
669 | 50.2k | column_string_chars.resize(new_size); |
670 | 50.2k | memcpy_small_allow_read_write_overflow15( |
671 | 50.2k | column_string_chars.data() + old_size, str_ref.data + str_offset, |
672 | 50.2k | split_part_size); |
673 | | // add dist string offset |
674 | 50.2k | string_pos += split_part_size; |
675 | 50.2k | } |
676 | 50.8k | column_string_offsets.push_back(string_pos); |
677 | | // array offset + 1 |
678 | 50.8k | dest_pos++; |
679 | | // add src string str_pos to next search start |
680 | 50.8k | str_pos += split_part_size + delimiter_ref.size; |
681 | 50.8k | } |
682 | 892 | } |
683 | 919 | dest_offsets.push_back(dest_pos); |
684 | 919 | } |
685 | 157 | } _ZNK5doris21FunctionSplitByString8_executeILb0ELb0EEEvRKNS_9ColumnStrIjEES5_RNS_7IColumnERNS_8PODArrayImLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm Line | Count | Source | 624 | 55 | size_t size) const { | 625 | 55 | auto& dest_column_string = assert_cast<ColumnString&>(dest_nested_column); | 626 | 55 | ColumnString::Chars& column_string_chars = dest_column_string.get_chars(); | 627 | 55 | ColumnString::Offsets& column_string_offsets = dest_column_string.get_offsets(); | 628 | 55 | column_string_chars.reserve(0); | 629 | | | 630 | 55 | ColumnArray::Offset64 string_pos = 0; | 631 | 55 | ColumnArray::Offset64 dest_pos = 0; | 632 | | | 633 | 55 | StringSearch search; | 634 | 55 | StringRef delimiter_ref_for_search; | 635 | | | 636 | | if constexpr (delimiter_const) { | 637 | | delimiter_ref_for_search = delimiter_column.get_data_at(0); | 638 | | search.set_pattern(&delimiter_ref_for_search); | 639 | | } | 640 | | | 641 | 130 | for (size_t i = 0; i < size; i++) { | 642 | 75 | const StringRef str_ref = | 643 | 75 | src_column_string.get_data_at(index_check_const<src_const>(i)); | 644 | 75 | const StringRef delimiter_ref = | 645 | 75 | delimiter_column.get_data_at(index_check_const<delimiter_const>(i)); | 646 | | | 647 | 75 | if (str_ref.size == 0) { | 648 | 13 | dest_offsets.push_back(dest_pos); | 649 | 13 | continue; | 650 | 13 | } | 651 | 62 | if (delimiter_ref.size == 0) { | 652 | 11 | split_empty_delimiter(str_ref, column_string_chars, column_string_offsets, | 653 | 11 | string_pos, dest_pos); | 654 | 51 | } else { | 655 | 51 | if constexpr (!delimiter_const) { | 656 | 51 | search.set_pattern(&delimiter_ref); | 657 | 51 | } | 658 | 214 | for (size_t str_pos = 0; str_pos <= str_ref.size;) { | 659 | 163 | const size_t str_offset = str_pos; | 660 | 163 | const size_t old_size = column_string_chars.size(); | 661 | | // search first match delimter_ref index from src string among str_offset to end | 662 | 163 | const 
char* result_start = | 663 | 163 | search.search(str_ref.data + str_offset, str_ref.size - str_offset); | 664 | | // compute split part size | 665 | 163 | const size_t split_part_size = result_start - str_ref.data - str_offset; | 666 | | // save dist string split part | 667 | 163 | if (split_part_size > 0) { | 668 | 122 | const size_t new_size = old_size + split_part_size; | 669 | 122 | column_string_chars.resize(new_size); | 670 | 122 | memcpy_small_allow_read_write_overflow15( | 671 | 122 | column_string_chars.data() + old_size, str_ref.data + str_offset, | 672 | 122 | split_part_size); | 673 | | // add dist string offset | 674 | 122 | string_pos += split_part_size; | 675 | 122 | } | 676 | 163 | column_string_offsets.push_back(string_pos); | 677 | | // array offset + 1 | 678 | 163 | dest_pos++; | 679 | | // add src string str_pos to next search start | 680 | 163 | str_pos += split_part_size + delimiter_ref.size; | 681 | 163 | } | 682 | 51 | } | 683 | 62 | dest_offsets.push_back(dest_pos); | 684 | 62 | } | 685 | 55 | } |
_ZNK5doris21FunctionSplitByString8_executeILb0ELb1EEEvRKNS_9ColumnStrIjEES5_RNS_7IColumnERNS_8PODArrayImLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm Line | Count | Source | 624 | 94 | size_t size) const { | 625 | 94 | auto& dest_column_string = assert_cast<ColumnString&>(dest_nested_column); | 626 | 94 | ColumnString::Chars& column_string_chars = dest_column_string.get_chars(); | 627 | 94 | ColumnString::Offsets& column_string_offsets = dest_column_string.get_offsets(); | 628 | 94 | column_string_chars.reserve(0); | 629 | | | 630 | 94 | ColumnArray::Offset64 string_pos = 0; | 631 | 94 | ColumnArray::Offset64 dest_pos = 0; | 632 | | | 633 | 94 | StringSearch search; | 634 | 94 | StringRef delimiter_ref_for_search; | 635 | | | 636 | 94 | if constexpr (delimiter_const) { | 637 | 94 | delimiter_ref_for_search = delimiter_column.get_data_at(0); | 638 | 94 | search.set_pattern(&delimiter_ref_for_search); | 639 | 94 | } | 640 | | | 641 | 1.07k | for (size_t i = 0; i < size; i++) { | 642 | 984 | const StringRef str_ref = | 643 | 984 | src_column_string.get_data_at(index_check_const<src_const>(i)); | 644 | 984 | const StringRef delimiter_ref = | 645 | 984 | delimiter_column.get_data_at(index_check_const<delimiter_const>(i)); | 646 | | | 647 | 984 | if (str_ref.size == 0) { | 648 | 135 | dest_offsets.push_back(dest_pos); | 649 | 135 | continue; | 650 | 135 | } | 651 | 849 | if (delimiter_ref.size == 0) { | 652 | 8 | split_empty_delimiter(str_ref, column_string_chars, column_string_offsets, | 653 | 8 | string_pos, dest_pos); | 654 | 841 | } else { | 655 | | if constexpr (!delimiter_const) { | 656 | | search.set_pattern(&delimiter_ref); | 657 | | } | 658 | 51.5k | for (size_t str_pos = 0; str_pos <= str_ref.size;) { | 659 | 50.7k | const size_t str_offset = str_pos; | 660 | 50.7k | const size_t old_size = column_string_chars.size(); | 661 | | // search first match delimter_ref index from src string among str_offset to end | 662 | 50.7k | 
const char* result_start = | 663 | 50.7k | search.search(str_ref.data + str_offset, str_ref.size - str_offset); | 664 | | // compute split part size | 665 | 50.7k | const size_t split_part_size = result_start - str_ref.data - str_offset; | 666 | | // save dist string split part | 667 | 50.7k | if (split_part_size > 0) { | 668 | 50.1k | const size_t new_size = old_size + split_part_size; | 669 | 50.1k | column_string_chars.resize(new_size); | 670 | 50.1k | memcpy_small_allow_read_write_overflow15( | 671 | 50.1k | column_string_chars.data() + old_size, str_ref.data + str_offset, | 672 | 50.1k | split_part_size); | 673 | | // add dist string offset | 674 | 50.1k | string_pos += split_part_size; | 675 | 50.1k | } | 676 | 50.7k | column_string_offsets.push_back(string_pos); | 677 | | // array offset + 1 | 678 | 50.7k | dest_pos++; | 679 | | // add src string str_pos to next search start | 680 | 50.7k | str_pos += split_part_size + delimiter_ref.size; | 681 | 50.7k | } | 682 | 841 | } | 683 | 849 | dest_offsets.push_back(dest_pos); | 684 | 849 | } | 685 | 94 | } |
_ZNK5doris21FunctionSplitByString8_executeILb1ELb0EEEvRKNS_9ColumnStrIjEES5_RNS_7IColumnERNS_8PODArrayImLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm Line | Count | Source | 624 | 8 | size_t size) const { | 625 | 8 | auto& dest_column_string = assert_cast<ColumnString&>(dest_nested_column); | 626 | 8 | ColumnString::Chars& column_string_chars = dest_column_string.get_chars(); | 627 | 8 | ColumnString::Offsets& column_string_offsets = dest_column_string.get_offsets(); | 628 | 8 | column_string_chars.reserve(0); | 629 | | | 630 | 8 | ColumnArray::Offset64 string_pos = 0; | 631 | 8 | ColumnArray::Offset64 dest_pos = 0; | 632 | | | 633 | 8 | StringSearch search; | 634 | 8 | StringRef delimiter_ref_for_search; | 635 | | | 636 | | if constexpr (delimiter_const) { | 637 | | delimiter_ref_for_search = delimiter_column.get_data_at(0); | 638 | | search.set_pattern(&delimiter_ref_for_search); | 639 | | } | 640 | | | 641 | 32 | for (size_t i = 0; i < size; i++) { | 642 | 24 | const StringRef str_ref = | 643 | 24 | src_column_string.get_data_at(index_check_const<src_const>(i)); | 644 | 24 | const StringRef delimiter_ref = | 645 | 24 | delimiter_column.get_data_at(index_check_const<delimiter_const>(i)); | 646 | | | 647 | 24 | if (str_ref.size == 0) { | 648 | 16 | dest_offsets.push_back(dest_pos); | 649 | 16 | continue; | 650 | 16 | } | 651 | 8 | if (delimiter_ref.size == 0) { | 652 | 8 | split_empty_delimiter(str_ref, column_string_chars, column_string_offsets, | 653 | 8 | string_pos, dest_pos); | 654 | 8 | } else { | 655 | 0 | if constexpr (!delimiter_const) { | 656 | 0 | search.set_pattern(&delimiter_ref); | 657 | 0 | } | 658 | 0 | for (size_t str_pos = 0; str_pos <= str_ref.size;) { | 659 | 0 | const size_t str_offset = str_pos; | 660 | 0 | const size_t old_size = column_string_chars.size(); | 661 | | // search first match delimter_ref index from src string among str_offset to end | 662 | 0 | const char* result_start = | 663 | 0 | 
search.search(str_ref.data + str_offset, str_ref.size - str_offset); | 664 | | // compute split part size | 665 | 0 | const size_t split_part_size = result_start - str_ref.data - str_offset; | 666 | | // save dist string split part | 667 | 0 | if (split_part_size > 0) { | 668 | 0 | const size_t new_size = old_size + split_part_size; | 669 | 0 | column_string_chars.resize(new_size); | 670 | 0 | memcpy_small_allow_read_write_overflow15( | 671 | 0 | column_string_chars.data() + old_size, str_ref.data + str_offset, | 672 | 0 | split_part_size); | 673 | | // add dist string offset | 674 | 0 | string_pos += split_part_size; | 675 | 0 | } | 676 | 0 | column_string_offsets.push_back(string_pos); | 677 | | // array offset + 1 | 678 | 0 | dest_pos++; | 679 | | // add src string str_pos to next search start | 680 | 0 | str_pos += split_part_size + delimiter_ref.size; | 681 | 0 | } | 682 | 0 | } | 683 | 8 | dest_offsets.push_back(dest_pos); | 684 | 8 | } | 685 | 8 | } |
Unexecuted instantiation: _ZNK5doris21FunctionSplitByString8_executeILb1ELb1EEEvRKNS_9ColumnStrIjEES5_RNS_7IColumnERNS_8PODArrayImLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm |
686 | | |
687 | | void split_empty_delimiter(const StringRef& str_ref, ColumnString::Chars& column_string_chars, |
688 | | ColumnString::Offsets& column_string_offsets, |
689 | | ColumnArray::Offset64& string_pos, |
690 | 27 | ColumnArray::Offset64& dest_pos) const { |
691 | 27 | const size_t old_size = column_string_chars.size(); |
692 | 27 | const size_t new_size = old_size + str_ref.size; |
693 | 27 | column_string_chars.resize(new_size); |
694 | 27 | memcpy(column_string_chars.data() + old_size, str_ref.data, str_ref.size); |
695 | 27 | if (simd::VStringFunctions::is_ascii(str_ref)) { |
696 | 24 | const auto size = str_ref.size; |
697 | | |
698 | 24 | const auto nested_old_size = column_string_offsets.size(); |
699 | 24 | const auto nested_new_size = nested_old_size + size; |
700 | 24 | column_string_offsets.resize(nested_new_size); |
701 | 24 | std::iota(column_string_offsets.data() + nested_old_size, |
702 | 24 | column_string_offsets.data() + nested_new_size, string_pos + 1); |
703 | | |
704 | 24 | string_pos += size; |
705 | 24 | dest_pos += size; |
706 | | // The above code is equivalent to the code in the following comment. |
707 | | // for (size_t i = 0; i < str_ref.size; i++) { |
708 | | // string_pos++; |
709 | | // column_string_offsets.push_back(string_pos); |
710 | | // (*dest_nested_null_map).push_back(false); |
711 | | // dest_pos++; |
712 | | // } |
713 | 24 | } else { |
714 | 22 | for (size_t i = 0, utf8_char_len = 0; i < str_ref.size; i += utf8_char_len) { |
715 | 19 | utf8_char_len = UTF8_BYTE_LENGTH[(unsigned char)str_ref.data[i]]; |
716 | | |
717 | 19 | string_pos += utf8_char_len; |
718 | 19 | column_string_offsets.push_back(string_pos); |
719 | 19 | dest_pos++; |
720 | 19 | } |
721 | 3 | } |
722 | 27 | } |
723 | | }; |
724 | | |
/// Selects which overload of `count_substrings` a FunctionCountSubString instantiation
/// implements, by number of arguments.
enum class FunctionCountSubStringType {
    TWO_ARGUMENTS,  ///< overload taking two arguments
    THREE_ARGUMENTS ///< overload taking three arguments
};
726 | | |
727 | | template <FunctionCountSubStringType type> |
728 | | class FunctionCountSubString : public IFunction { |
729 | | public: |
730 | | static constexpr auto name = "count_substrings"; |
731 | | static constexpr auto arg_count = (type == FunctionCountSubStringType::TWO_ARGUMENTS) ? 2 : 3; |
732 | | |
733 | 283 | static FunctionPtr create() { return std::make_shared<FunctionCountSubString>(); }_ZN5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE6createEv Line | Count | Source | 733 | 77 | static FunctionPtr create() { return std::make_shared<FunctionCountSubString>(); } |
_ZN5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE6createEv Line | Count | Source | 733 | 206 | static FunctionPtr create() { return std::make_shared<FunctionCountSubString>(); } |
|
734 | | using NullMapType = PaddedPODArray<UInt8>; |
735 | | |
736 | 0 | String get_name() const override { return name; }Unexecuted instantiation: _ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE8get_nameB5cxx11Ev Unexecuted instantiation: _ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE8get_nameB5cxx11Ev |
737 | | |
738 | 0 | size_t get_number_of_arguments() const override { return arg_count; }Unexecuted instantiation: _ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE23get_number_of_argumentsEv Unexecuted instantiation: _ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE23get_number_of_argumentsEv |
739 | | |
740 | 265 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
741 | 265 | return std::make_shared<DataTypeInt32>(); |
742 | 265 | } _ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS7_EE Line | Count | Source | 740 | 68 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { | 741 | 68 | return std::make_shared<DataTypeInt32>(); | 742 | 68 | } |
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS7_EE Line | Count | Source | 740 | 197 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { | 741 | 197 | return std::make_shared<DataTypeInt32>(); | 742 | 197 | } |
|
743 | | |
744 | 16 | DataTypes get_variadic_argument_types_impl() const override { |
745 | 16 | if constexpr (type == FunctionCountSubStringType::TWO_ARGUMENTS) { |
746 | 8 | return {std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>()}; |
747 | 8 | } else { |
748 | 8 | return {std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>(), |
749 | 8 | std::make_shared<DataTypeInt32>()}; |
750 | 8 | } |
751 | 16 | } _ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE32get_variadic_argument_types_implEv Line | Count | Source | 744 | 8 | DataTypes get_variadic_argument_types_impl() const override { | 745 | 8 | if constexpr (type == FunctionCountSubStringType::TWO_ARGUMENTS) { | 746 | 8 | return {std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>()}; | 747 | | } else { | 748 | | return {std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>(), | 749 | | std::make_shared<DataTypeInt32>()}; | 750 | | } | 751 | 8 | } |
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE32get_variadic_argument_types_implEv Line | Count | Source | 744 | 8 | DataTypes get_variadic_argument_types_impl() const override { | 745 | | if constexpr (type == FunctionCountSubStringType::TWO_ARGUMENTS) { | 746 | | return {std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>()}; | 747 | 8 | } else { | 748 | 8 | return {std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>(), | 749 | 8 | std::make_shared<DataTypeInt32>()}; | 750 | 8 | } | 751 | 8 | } |
|
752 | | |
753 | 267 | bool is_variadic() const override { return true; }_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE11is_variadicEv Line | Count | Source | 753 | 69 | bool is_variadic() const override { return true; } |
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE11is_variadicEv Line | Count | Source | 753 | 198 | bool is_variadic() const override { return true; } |
|
754 | | |
755 | | Status execute_impl(FunctionContext* /*context*/, Block& block, const ColumnNumbers& arguments, |
756 | 234 | uint32_t result, size_t input_rows_count) const override { |
757 | 234 | DCHECK(arg_count); |
758 | 234 | bool col_const[arg_count]; |
759 | 234 | ColumnPtr argument_columns[arg_count]; |
760 | 878 | for (int i = 0; i < arg_count; ++i) { |
761 | 644 | std::tie(argument_columns[i], col_const[i]) = |
762 | 644 | unpack_if_const(block.get_by_position(arguments[i]).column); |
763 | 644 | } |
764 | | |
765 | 234 | auto dest_column_ptr = ColumnInt32::create(input_rows_count); |
766 | 234 | auto& dest_column_data = dest_column_ptr->get_data(); |
767 | | |
768 | 234 | if constexpr (type == FunctionCountSubStringType::TWO_ARGUMENTS) { |
769 | 58 | const auto& src_column_string = assert_cast<const ColumnString&>(*argument_columns[0]); |
770 | 58 | const auto& pattern_column = assert_cast<const ColumnString&>(*argument_columns[1]); |
771 | 58 | std::visit( |
772 | 58 | [&](auto str_const, auto pattern_const) { |
773 | 58 | _execute<str_const, pattern_const>(src_column_string, pattern_column, |
774 | 58 | dest_column_data, input_rows_count); |
775 | 58 | }, _ZZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_E_clISt17integral_constantIbLb0EESH_EEDaSC_SD_ Line | Count | Source | 772 | 32 | [&](auto str_const, auto pattern_const) { | 773 | 32 | _execute<str_const, pattern_const>(src_column_string, pattern_column, | 774 | 32 | dest_column_data, input_rows_count); | 775 | 32 | }, |
_ZZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_E_clISt17integral_constantIbLb0EESG_IbLb1EEEEDaSC_SD_ Line | Count | Source | 772 | 13 | [&](auto str_const, auto pattern_const) { | 773 | 13 | _execute<str_const, pattern_const>(src_column_string, pattern_column, | 774 | 13 | dest_column_data, input_rows_count); | 775 | 13 | }, |
_ZZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_E_clISt17integral_constantIbLb1EESG_IbLb0EEEEDaSC_SD_ Line | Count | Source | 772 | 13 | [&](auto str_const, auto pattern_const) { | 773 | 13 | _execute<str_const, pattern_const>(src_column_string, pattern_column, | 774 | 13 | dest_column_data, input_rows_count); | 775 | 13 | }, |
Unexecuted instantiation: _ZZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_E_clISt17integral_constantIbLb1EESH_EEDaSC_SD_ |
776 | 58 | make_bool_variant(col_const[0]), make_bool_variant(col_const[1])); |
777 | 176 | } else { |
778 | 176 | const auto& src_column_string = assert_cast<const ColumnString&>(*argument_columns[0]); |
779 | 176 | const auto& pattern_column = assert_cast<const ColumnString&>(*argument_columns[1]); |
780 | 176 | const auto& start_pos_column = assert_cast<const ColumnInt32&>(*argument_columns[2]); |
781 | 176 | std::visit( |
782 | 176 | [&](auto str_const, auto pattern_const, auto start_pos_const) { |
783 | 176 | _execute<str_const, pattern_const, start_pos_const>( |
784 | 176 | src_column_string, pattern_column, start_pos_column, |
785 | 176 | dest_column_data, input_rows_count); |
786 | 176 | }, _ZZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb0EESI_SI_EEDaSC_SD_SE_ Line | Count | Source | 782 | 36 | [&](auto str_const, auto pattern_const, auto start_pos_const) { | 783 | 36 | _execute<str_const, pattern_const, start_pos_const>( | 784 | 36 | src_column_string, pattern_column, start_pos_column, | 785 | 36 | dest_column_data, input_rows_count); | 786 | 36 | }, |
_ZZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb0EESI_SH_IbLb1EEEEDaSC_SD_SE_ Line | Count | Source | 782 | 29 | [&](auto str_const, auto pattern_const, auto start_pos_const) { | 783 | 29 | _execute<str_const, pattern_const, start_pos_const>( | 784 | 29 | src_column_string, pattern_column, start_pos_column, | 785 | 29 | dest_column_data, input_rows_count); | 786 | 29 | }, |
_ZZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb0EESH_IbLb1EESI_EEDaSC_SD_SE_ Line | Count | Source | 782 | 22 | [&](auto str_const, auto pattern_const, auto start_pos_const) { | 783 | 22 | _execute<str_const, pattern_const, start_pos_const>( | 784 | 22 | src_column_string, pattern_column, start_pos_column, | 785 | 22 | dest_column_data, input_rows_count); | 786 | 22 | }, |
_ZZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb0EESH_IbLb1EESJ_EEDaSC_SD_SE_ Line | Count | Source | 782 | 23 | [&](auto str_const, auto pattern_const, auto start_pos_const) { | 783 | 23 | _execute<str_const, pattern_const, start_pos_const>( | 784 | 23 | src_column_string, pattern_column, start_pos_column, | 785 | 23 | dest_column_data, input_rows_count); | 786 | 23 | }, |
_ZZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb1EESH_IbLb0EESJ_EEDaSC_SD_SE_ Line | Count | Source | 782 | 22 | [&](auto str_const, auto pattern_const, auto start_pos_const) { | 783 | 22 | _execute<str_const, pattern_const, start_pos_const>( | 784 | 22 | src_column_string, pattern_column, start_pos_column, | 785 | 22 | dest_column_data, input_rows_count); | 786 | 22 | }, |
_ZZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb1EESH_IbLb0EESI_EEDaSC_SD_SE_ Line | Count | Source | 782 | 22 | [&](auto str_const, auto pattern_const, auto start_pos_const) { | 783 | 22 | _execute<str_const, pattern_const, start_pos_const>( | 784 | 22 | src_column_string, pattern_column, start_pos_column, | 785 | 22 | dest_column_data, input_rows_count); | 786 | 22 | }, |
_ZZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb1EESI_SH_IbLb0EEEEDaSC_SD_SE_ Line | Count | Source | 782 | 22 | [&](auto str_const, auto pattern_const, auto start_pos_const) { | 783 | 22 | _execute<str_const, pattern_const, start_pos_const>( | 784 | 22 | src_column_string, pattern_column, start_pos_column, | 785 | 22 | dest_column_data, input_rows_count); | 786 | 22 | }, |
Unexecuted instantiation: _ZZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjmENKUlT_T0_T1_E_clISt17integral_constantIbLb1EESI_SI_EEDaSC_SD_SE_ |
787 | 176 | make_bool_variant(col_const[0]), make_bool_variant(col_const[1]), |
788 | 176 | make_bool_variant(col_const[2])); |
789 | 176 | } |
790 | | |
791 | 234 | block.replace_by_position(result, std::move(dest_column_ptr)); |
792 | 234 | return Status::OK(); |
793 | 234 | } _ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm Line | Count | Source | 756 | 58 | uint32_t result, size_t input_rows_count) const override { | 757 | 58 | DCHECK(arg_count); | 758 | 58 | bool col_const[arg_count]; | 759 | 58 | ColumnPtr argument_columns[arg_count]; | 760 | 174 | for (int i = 0; i < arg_count; ++i) { | 761 | 116 | std::tie(argument_columns[i], col_const[i]) = | 762 | 116 | unpack_if_const(block.get_by_position(arguments[i]).column); | 763 | 116 | } | 764 | | | 765 | 58 | auto dest_column_ptr = ColumnInt32::create(input_rows_count); | 766 | 58 | auto& dest_column_data = dest_column_ptr->get_data(); | 767 | | | 768 | 58 | if constexpr (type == FunctionCountSubStringType::TWO_ARGUMENTS) { | 769 | 58 | const auto& src_column_string = assert_cast<const ColumnString&>(*argument_columns[0]); | 770 | 58 | const auto& pattern_column = assert_cast<const ColumnString&>(*argument_columns[1]); | 771 | 58 | std::visit( | 772 | 58 | [&](auto str_const, auto pattern_const) { | 773 | 58 | _execute<str_const, pattern_const>(src_column_string, pattern_column, | 774 | 58 | dest_column_data, input_rows_count); | 775 | 58 | }, | 776 | 58 | make_bool_variant(col_const[0]), make_bool_variant(col_const[1])); | 777 | | } else { | 778 | | const auto& src_column_string = assert_cast<const ColumnString&>(*argument_columns[0]); | 779 | | const auto& pattern_column = assert_cast<const ColumnString&>(*argument_columns[1]); | 780 | | const auto& start_pos_column = assert_cast<const ColumnInt32&>(*argument_columns[2]); | 781 | | std::visit( | 782 | | [&](auto str_const, auto pattern_const, auto start_pos_const) { | 783 | | _execute<str_const, pattern_const, start_pos_const>( | 784 | | src_column_string, pattern_column, start_pos_column, | 785 | | dest_column_data, input_rows_count); | 786 | | }, | 787 | | make_bool_variant(col_const[0]), make_bool_variant(col_const[1]), | 788 | 
| make_bool_variant(col_const[2])); | 789 | | } | 790 | | | 791 | 58 | block.replace_by_position(result, std::move(dest_column_ptr)); | 792 | 58 | return Status::OK(); | 793 | 58 | } |
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm Line | Count | Source | 756 | 176 | uint32_t result, size_t input_rows_count) const override { | 757 | 176 | DCHECK(arg_count); | 758 | 176 | bool col_const[arg_count]; | 759 | 176 | ColumnPtr argument_columns[arg_count]; | 760 | 704 | for (int i = 0; i < arg_count; ++i) { | 761 | 528 | std::tie(argument_columns[i], col_const[i]) = | 762 | 528 | unpack_if_const(block.get_by_position(arguments[i]).column); | 763 | 528 | } | 764 | | | 765 | 176 | auto dest_column_ptr = ColumnInt32::create(input_rows_count); | 766 | 176 | auto& dest_column_data = dest_column_ptr->get_data(); | 767 | | | 768 | | if constexpr (type == FunctionCountSubStringType::TWO_ARGUMENTS) { | 769 | | const auto& src_column_string = assert_cast<const ColumnString&>(*argument_columns[0]); | 770 | | const auto& pattern_column = assert_cast<const ColumnString&>(*argument_columns[1]); | 771 | | std::visit( | 772 | | [&](auto str_const, auto pattern_const) { | 773 | | _execute<str_const, pattern_const>(src_column_string, pattern_column, | 774 | | dest_column_data, input_rows_count); | 775 | | }, | 776 | | make_bool_variant(col_const[0]), make_bool_variant(col_const[1])); | 777 | 176 | } else { | 778 | 176 | const auto& src_column_string = assert_cast<const ColumnString&>(*argument_columns[0]); | 779 | 176 | const auto& pattern_column = assert_cast<const ColumnString&>(*argument_columns[1]); | 780 | 176 | const auto& start_pos_column = assert_cast<const ColumnInt32&>(*argument_columns[2]); | 781 | 176 | std::visit( | 782 | 176 | [&](auto str_const, auto pattern_const, auto start_pos_const) { | 783 | 176 | _execute<str_const, pattern_const, start_pos_const>( | 784 | 176 | src_column_string, pattern_column, start_pos_column, | 785 | 176 | dest_column_data, input_rows_count); | 786 | 176 | }, | 787 | 176 | make_bool_variant(col_const[0]), 
make_bool_variant(col_const[1]), | 788 | 176 | make_bool_variant(col_const[2])); | 789 | 176 | } | 790 | | | 791 | 176 | block.replace_by_position(result, std::move(dest_column_ptr)); | 792 | 176 | return Status::OK(); | 793 | 176 | } |
|
794 | | |
795 | | private: |
796 | | template <bool src_const, bool pattern_const> |
797 | | void _execute(const ColumnString& src_column_string, const ColumnString& pattern_column, |
798 | 58 | ColumnInt32::Container& dest_column_data, size_t size) const { |
799 | 241 | for (size_t i = 0; i < size; i++) { |
800 | 183 | const StringRef str_ref = |
801 | 183 | src_column_string.get_data_at(index_check_const<src_const>(i)); |
802 | | |
803 | 183 | const StringRef pattern_ref = |
804 | 183 | pattern_column.get_data_at(index_check_const<pattern_const>(i)); |
805 | 183 | dest_column_data[i] = find_str_count(str_ref, pattern_ref); |
806 | 183 | } |
807 | 58 | } _ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE8_executeILb0ELb0EEEvRKNS_9ColumnStrIjEES7_RNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm Line | Count | Source | 798 | 32 | ColumnInt32::Container& dest_column_data, size_t size) const { | 799 | 133 | for (size_t i = 0; i < size; i++) { | 800 | 101 | const StringRef str_ref = | 801 | 101 | src_column_string.get_data_at(index_check_const<src_const>(i)); | 802 | | | 803 | 101 | const StringRef pattern_ref = | 804 | 101 | pattern_column.get_data_at(index_check_const<pattern_const>(i)); | 805 | 101 | dest_column_data[i] = find_str_count(str_ref, pattern_ref); | 806 | 101 | } | 807 | 32 | } |
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE8_executeILb0ELb1EEEvRKNS_9ColumnStrIjEES7_RNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm Line | Count | Source | 798 | 13 | ColumnInt32::Container& dest_column_data, size_t size) const { | 799 | 54 | for (size_t i = 0; i < size; i++) { | 800 | 41 | const StringRef str_ref = | 801 | 41 | src_column_string.get_data_at(index_check_const<src_const>(i)); | 802 | | | 803 | 41 | const StringRef pattern_ref = | 804 | 41 | pattern_column.get_data_at(index_check_const<pattern_const>(i)); | 805 | 41 | dest_column_data[i] = find_str_count(str_ref, pattern_ref); | 806 | 41 | } | 807 | 13 | } |
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE8_executeILb1ELb0EEEvRKNS_9ColumnStrIjEES7_RNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm Line | Count | Source | 798 | 13 | ColumnInt32::Container& dest_column_data, size_t size) const { | 799 | 54 | for (size_t i = 0; i < size; i++) { | 800 | 41 | const StringRef str_ref = | 801 | 41 | src_column_string.get_data_at(index_check_const<src_const>(i)); | 802 | | | 803 | 41 | const StringRef pattern_ref = | 804 | 41 | pattern_column.get_data_at(index_check_const<pattern_const>(i)); | 805 | 41 | dest_column_data[i] = find_str_count(str_ref, pattern_ref); | 806 | 41 | } | 807 | 13 | } |
Unexecuted instantiation: _ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE8_executeILb1ELb1EEEvRKNS_9ColumnStrIjEES7_RNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm |
808 | | |
809 | | template <bool src_const, bool pattern_const, bool start_pos_const> |
810 | | void _execute(const ColumnString& src_column_string, const ColumnString& pattern_column, |
811 | | const ColumnInt32& start_pos_column, ColumnInt32::Container& dest_column_data, |
812 | 176 | size_t size) const { |
813 | 411 | for (size_t i = 0; i < size; i++) { |
814 | 235 | const StringRef str_ref = |
815 | 235 | src_column_string.get_data_at(index_check_const<src_const>(i)); |
816 | 235 | const StringRef pattern_ref = |
817 | 235 | pattern_column.get_data_at(index_check_const<pattern_const>(i)); |
818 | | // 1-based index |
819 | 235 | int32_t start_pos = |
820 | 235 | start_pos_column.get_element(index_check_const<start_pos_const>(i)) - 1; |
821 | | |
822 | 235 | const char* p = str_ref.begin(); |
823 | 235 | const char* end = str_ref.end(); |
824 | 235 | int char_size = 0; |
825 | 1.47k | for (size_t j = 0; j < start_pos && p < end; ++j, p += char_size) { |
826 | 1.24k | char_size = UTF8_BYTE_LENGTH[static_cast<uint8_t>(*p)]; |
827 | 1.24k | } |
828 | 235 | const auto start_byte_len = p - str_ref.begin(); |
829 | | |
830 | 235 | if (start_pos < 0 || start_byte_len >= str_ref.size) { |
831 | 134 | dest_column_data[i] = 0; |
832 | 134 | } else { |
833 | 101 | dest_column_data[i] = |
834 | 101 | find_str_count(str_ref.substring(start_byte_len), pattern_ref); |
835 | 101 | } |
836 | 235 | } |
837 | 176 | } _ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE8_executeILb0ELb0ELb0EEEvRKNS_9ColumnStrIjEES7_RKNS_12ColumnVectorILNS_13PrimitiveTypeE5EEERNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm Line | Count | Source | 812 | 36 | size_t size) const { | 813 | 97 | for (size_t i = 0; i < size; i++) { | 814 | 61 | const StringRef str_ref = | 815 | 61 | src_column_string.get_data_at(index_check_const<src_const>(i)); | 816 | 61 | const StringRef pattern_ref = | 817 | 61 | pattern_column.get_data_at(index_check_const<pattern_const>(i)); | 818 | | // 1-based index | 819 | 61 | int32_t start_pos = | 820 | 61 | start_pos_column.get_element(index_check_const<start_pos_const>(i)) - 1; | 821 | | | 822 | 61 | const char* p = str_ref.begin(); | 823 | 61 | const char* end = str_ref.end(); | 824 | 61 | int char_size = 0; | 825 | 456 | for (size_t j = 0; j < start_pos && p < end; ++j, p += char_size) { | 826 | 395 | char_size = UTF8_BYTE_LENGTH[static_cast<uint8_t>(*p)]; | 827 | 395 | } | 828 | 61 | const auto start_byte_len = p - str_ref.begin(); | 829 | | | 830 | 61 | if (start_pos < 0 || start_byte_len >= str_ref.size) { | 831 | 38 | dest_column_data[i] = 0; | 832 | 38 | } else { | 833 | 23 | dest_column_data[i] = | 834 | 23 | find_str_count(str_ref.substring(start_byte_len), pattern_ref); | 835 | 23 | } | 836 | 61 | } | 837 | 36 | } |
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE8_executeILb0ELb0ELb1EEEvRKNS_9ColumnStrIjEES7_RKNS_12ColumnVectorILNS_13PrimitiveTypeE5EEERNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm Line | Count | Source | 812 | 29 | size_t size) const { | 813 | 78 | for (size_t i = 0; i < size; i++) { | 814 | 49 | const StringRef str_ref = | 815 | 49 | src_column_string.get_data_at(index_check_const<src_const>(i)); | 816 | 49 | const StringRef pattern_ref = | 817 | 49 | pattern_column.get_data_at(index_check_const<pattern_const>(i)); | 818 | | // 1-based index | 819 | 49 | int32_t start_pos = | 820 | 49 | start_pos_column.get_element(index_check_const<start_pos_const>(i)) - 1; | 821 | | | 822 | 49 | const char* p = str_ref.begin(); | 823 | 49 | const char* end = str_ref.end(); | 824 | 49 | int char_size = 0; | 825 | 242 | for (size_t j = 0; j < start_pos && p < end; ++j, p += char_size) { | 826 | 193 | char_size = UTF8_BYTE_LENGTH[static_cast<uint8_t>(*p)]; | 827 | 193 | } | 828 | 49 | const auto start_byte_len = p - str_ref.begin(); | 829 | | | 830 | 49 | if (start_pos < 0 || start_byte_len >= str_ref.size) { | 831 | 22 | dest_column_data[i] = 0; | 832 | 27 | } else { | 833 | 27 | dest_column_data[i] = | 834 | 27 | find_str_count(str_ref.substring(start_byte_len), pattern_ref); | 835 | 27 | } | 836 | 49 | } | 837 | 29 | } |
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE8_executeILb0ELb1ELb0EEEvRKNS_9ColumnStrIjEES7_RKNS_12ColumnVectorILNS_13PrimitiveTypeE5EEERNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm Line | Count | Source | 812 | 22 | size_t size) const { | 813 | 44 | for (size_t i = 0; i < size; i++) { | 814 | 22 | const StringRef str_ref = | 815 | 22 | src_column_string.get_data_at(index_check_const<src_const>(i)); | 816 | 22 | const StringRef pattern_ref = | 817 | 22 | pattern_column.get_data_at(index_check_const<pattern_const>(i)); | 818 | | // 1-based index | 819 | 22 | int32_t start_pos = | 820 | 22 | start_pos_column.get_element(index_check_const<start_pos_const>(i)) - 1; | 821 | | | 822 | 22 | const char* p = str_ref.begin(); | 823 | 22 | const char* end = str_ref.end(); | 824 | 22 | int char_size = 0; | 825 | 151 | for (size_t j = 0; j < start_pos && p < end; ++j, p += char_size) { | 826 | 129 | char_size = UTF8_BYTE_LENGTH[static_cast<uint8_t>(*p)]; | 827 | 129 | } | 828 | 22 | const auto start_byte_len = p - str_ref.begin(); | 829 | | | 830 | 22 | if (start_pos < 0 || start_byte_len >= str_ref.size) { | 831 | 14 | dest_column_data[i] = 0; | 832 | 14 | } else { | 833 | 8 | dest_column_data[i] = | 834 | 8 | find_str_count(str_ref.substring(start_byte_len), pattern_ref); | 835 | 8 | } | 836 | 22 | } | 837 | 22 | } |
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE8_executeILb0ELb1ELb1EEEvRKNS_9ColumnStrIjEES7_RKNS_12ColumnVectorILNS_13PrimitiveTypeE5EEERNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm Line | Count | Source | 812 | 23 | size_t size) const { | 813 | 60 | for (size_t i = 0; i < size; i++) { | 814 | 37 | const StringRef str_ref = | 815 | 37 | src_column_string.get_data_at(index_check_const<src_const>(i)); | 816 | 37 | const StringRef pattern_ref = | 817 | 37 | pattern_column.get_data_at(index_check_const<pattern_const>(i)); | 818 | | // 1-based index | 819 | 37 | int32_t start_pos = | 820 | 37 | start_pos_column.get_element(index_check_const<start_pos_const>(i)) - 1; | 821 | | | 822 | 37 | const char* p = str_ref.begin(); | 823 | 37 | const char* end = str_ref.end(); | 824 | 37 | int char_size = 0; | 825 | 177 | for (size_t j = 0; j < start_pos && p < end; ++j, p += char_size) { | 826 | 140 | char_size = UTF8_BYTE_LENGTH[static_cast<uint8_t>(*p)]; | 827 | 140 | } | 828 | 37 | const auto start_byte_len = p - str_ref.begin(); | 829 | | | 830 | 37 | if (start_pos < 0 || start_byte_len >= str_ref.size) { | 831 | 18 | dest_column_data[i] = 0; | 832 | 19 | } else { | 833 | 19 | dest_column_data[i] = | 834 | 19 | find_str_count(str_ref.substring(start_byte_len), pattern_ref); | 835 | 19 | } | 836 | 37 | } | 837 | 23 | } |
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE8_executeILb1ELb0ELb0EEEvRKNS_9ColumnStrIjEES7_RKNS_12ColumnVectorILNS_13PrimitiveTypeE5EEERNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm Line | Count | Source | 812 | 22 | size_t size) const { | 813 | 44 | for (size_t i = 0; i < size; i++) { | 814 | 22 | const StringRef str_ref = | 815 | 22 | src_column_string.get_data_at(index_check_const<src_const>(i)); | 816 | 22 | const StringRef pattern_ref = | 817 | 22 | pattern_column.get_data_at(index_check_const<pattern_const>(i)); | 818 | | // 1-based index | 819 | 22 | int32_t start_pos = | 820 | 22 | start_pos_column.get_element(index_check_const<start_pos_const>(i)) - 1; | 821 | | | 822 | 22 | const char* p = str_ref.begin(); | 823 | 22 | const char* end = str_ref.end(); | 824 | 22 | int char_size = 0; | 825 | 151 | for (size_t j = 0; j < start_pos && p < end; ++j, p += char_size) { | 826 | 129 | char_size = UTF8_BYTE_LENGTH[static_cast<uint8_t>(*p)]; | 827 | 129 | } | 828 | 22 | const auto start_byte_len = p - str_ref.begin(); | 829 | | | 830 | 22 | if (start_pos < 0 || start_byte_len >= str_ref.size) { | 831 | 14 | dest_column_data[i] = 0; | 832 | 14 | } else { | 833 | 8 | dest_column_data[i] = | 834 | 8 | find_str_count(str_ref.substring(start_byte_len), pattern_ref); | 835 | 8 | } | 836 | 22 | } | 837 | 22 | } |
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE8_executeILb1ELb0ELb1EEEvRKNS_9ColumnStrIjEES7_RKNS_12ColumnVectorILNS_13PrimitiveTypeE5EEERNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm Line | Count | Source | 812 | 22 | size_t size) const { | 813 | 44 | for (size_t i = 0; i < size; i++) { | 814 | 22 | const StringRef str_ref = | 815 | 22 | src_column_string.get_data_at(index_check_const<src_const>(i)); | 816 | 22 | const StringRef pattern_ref = | 817 | 22 | pattern_column.get_data_at(index_check_const<pattern_const>(i)); | 818 | | // 1-based index | 819 | 22 | int32_t start_pos = | 820 | 22 | start_pos_column.get_element(index_check_const<start_pos_const>(i)) - 1; | 821 | | | 822 | 22 | const char* p = str_ref.begin(); | 823 | 22 | const char* end = str_ref.end(); | 824 | 22 | int char_size = 0; | 825 | 151 | for (size_t j = 0; j < start_pos && p < end; ++j, p += char_size) { | 826 | 129 | char_size = UTF8_BYTE_LENGTH[static_cast<uint8_t>(*p)]; | 827 | 129 | } | 828 | 22 | const auto start_byte_len = p - str_ref.begin(); | 829 | | | 830 | 22 | if (start_pos < 0 || start_byte_len >= str_ref.size) { | 831 | 14 | dest_column_data[i] = 0; | 832 | 14 | } else { | 833 | 8 | dest_column_data[i] = | 834 | 8 | find_str_count(str_ref.substring(start_byte_len), pattern_ref); | 835 | 8 | } | 836 | 22 | } | 837 | 22 | } |
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE8_executeILb1ELb1ELb0EEEvRKNS_9ColumnStrIjEES7_RKNS_12ColumnVectorILNS_13PrimitiveTypeE5EEERNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm Line | Count | Source | 812 | 22 | size_t size) const { | 813 | 44 | for (size_t i = 0; i < size; i++) { | 814 | 22 | const StringRef str_ref = | 815 | 22 | src_column_string.get_data_at(index_check_const<src_const>(i)); | 816 | 22 | const StringRef pattern_ref = | 817 | 22 | pattern_column.get_data_at(index_check_const<pattern_const>(i)); | 818 | | // 1-based index | 819 | 22 | int32_t start_pos = | 820 | 22 | start_pos_column.get_element(index_check_const<start_pos_const>(i)) - 1; | 821 | | | 822 | 22 | const char* p = str_ref.begin(); | 823 | 22 | const char* end = str_ref.end(); | 824 | 22 | int char_size = 0; | 825 | 151 | for (size_t j = 0; j < start_pos && p < end; ++j, p += char_size) { | 826 | 129 | char_size = UTF8_BYTE_LENGTH[static_cast<uint8_t>(*p)]; | 827 | 129 | } | 828 | 22 | const auto start_byte_len = p - str_ref.begin(); | 829 | | | 830 | 22 | if (start_pos < 0 || start_byte_len >= str_ref.size) { | 831 | 14 | dest_column_data[i] = 0; | 832 | 14 | } else { | 833 | 8 | dest_column_data[i] = | 834 | 8 | find_str_count(str_ref.substring(start_byte_len), pattern_ref); | 835 | 8 | } | 836 | 22 | } | 837 | 22 | } |
Unexecuted instantiation: _ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE8_executeILb1ELb1ELb1EEEvRKNS_9ColumnStrIjEES7_RKNS_12ColumnVectorILNS_13PrimitiveTypeE5EEERNS_8PODArrayIiLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEEm |
838 | | |
839 | 529 | size_t find_pos(size_t pos, const StringRef str_ref, const StringRef pattern_ref) const { |
840 | 529 | size_t old_size = pos; |
841 | 529 | size_t str_size = str_ref.size; |
842 | 2.20k | while (pos < str_size && |
843 | 2.20k | memcmp_small_allow_overflow15((const uint8_t*)str_ref.data + pos, |
844 | 1.98k | (const uint8_t*)pattern_ref.data, pattern_ref.size)) { |
845 | 1.67k | pos++; |
846 | 1.67k | } |
847 | 529 | return pos - old_size; |
848 | 529 | } _ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE8find_posEmNS_9StringRefES3_ Line | Count | Source | 839 | 291 | size_t find_pos(size_t pos, const StringRef str_ref, const StringRef pattern_ref) const { | 840 | 291 | size_t old_size = pos; | 841 | 291 | size_t str_size = str_ref.size; | 842 | 1.05k | while (pos < str_size && | 843 | 1.05k | memcmp_small_allow_overflow15((const uint8_t*)str_ref.data + pos, | 844 | 933 | (const uint8_t*)pattern_ref.data, pattern_ref.size)) { | 845 | 763 | pos++; | 846 | 763 | } | 847 | 291 | return pos - old_size; | 848 | 291 | } |
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE8find_posEmNS_9StringRefES3_ Line | Count | Source | 839 | 238 | size_t find_pos(size_t pos, const StringRef str_ref, const StringRef pattern_ref) const { | 840 | 238 | size_t old_size = pos; | 841 | 238 | size_t str_size = str_ref.size; | 842 | 1.15k | while (pos < str_size && | 843 | 1.15k | memcmp_small_allow_overflow15((const uint8_t*)str_ref.data + pos, | 844 | 1.05k | (const uint8_t*)pattern_ref.data, pattern_ref.size)) { | 845 | 914 | pos++; | 846 | 914 | } | 847 | 238 | return pos - old_size; | 848 | 238 | } |
|
849 | | |
850 | 284 | int find_str_count(const StringRef str_ref, StringRef pattern_ref) const { |
851 | 284 | int count = 0; |
852 | 284 | if (str_ref.size == 0 || pattern_ref.size == 0) { |
853 | 64 | return 0; |
854 | 220 | } else { |
855 | 529 | for (size_t str_pos = 0; str_pos <= str_ref.size;) { |
856 | 529 | const size_t res_pos = find_pos(str_pos, str_ref, pattern_ref); |
857 | 529 | if (res_pos == (str_ref.size - str_pos)) { |
858 | 220 | break; // not find |
859 | 220 | } |
860 | 309 | count++; |
861 | 309 | str_pos = str_pos + res_pos + pattern_ref.size; |
862 | 309 | } |
863 | 220 | } |
864 | 220 | return count; |
865 | 284 | } _ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE0EE14find_str_countENS_9StringRefES3_ Line | Count | Source | 850 | 183 | int find_str_count(const StringRef str_ref, StringRef pattern_ref) const { | 851 | 183 | int count = 0; | 852 | 183 | if (str_ref.size == 0 || pattern_ref.size == 0) { | 853 | 62 | return 0; | 854 | 121 | } else { | 855 | 291 | for (size_t str_pos = 0; str_pos <= str_ref.size;) { | 856 | 291 | const size_t res_pos = find_pos(str_pos, str_ref, pattern_ref); | 857 | 291 | if (res_pos == (str_ref.size - str_pos)) { | 858 | 121 | break; // not find | 859 | 121 | } | 860 | 170 | count++; | 861 | 170 | str_pos = str_pos + res_pos + pattern_ref.size; | 862 | 170 | } | 863 | 121 | } | 864 | 121 | return count; | 865 | 183 | } |
_ZNK5doris22FunctionCountSubStringILNS_26FunctionCountSubStringTypeE1EE14find_str_countENS_9StringRefES3_ Line | Count | Source | 850 | 101 | int find_str_count(const StringRef str_ref, StringRef pattern_ref) const { | 851 | 101 | int count = 0; | 852 | 101 | if (str_ref.size == 0 || pattern_ref.size == 0) { | 853 | 2 | return 0; | 854 | 99 | } else { | 855 | 238 | for (size_t str_pos = 0; str_pos <= str_ref.size;) { | 856 | 238 | const size_t res_pos = find_pos(str_pos, str_ref, pattern_ref); | 857 | 238 | if (res_pos == (str_ref.size - str_pos)) { | 858 | 99 | break; // not find | 859 | 99 | } | 860 | 139 | count++; | 861 | 139 | str_pos = str_pos + res_pos + pattern_ref.size; | 862 | 139 | } | 863 | 99 | } | 864 | 99 | return count; | 865 | 101 | } |
|
866 | | }; |
867 | | |
868 | 8 | void register_function_string_search(SimpleFunctionFactory& factory) { |
869 | 8 | factory.register_function<FunctionStringLocatePos>(); |
870 | 8 | factory.register_function<FunctionSplitPart>(); |
871 | 8 | factory.register_function<FunctionSplitByString>(); |
872 | 8 | factory.register_function<FunctionCountSubString<FunctionCountSubStringType::TWO_ARGUMENTS>>(); |
873 | 8 | factory.register_function< |
874 | 8 | FunctionCountSubString<FunctionCountSubStringType::THREE_ARGUMENTS>>(); |
875 | 8 | factory.register_function<FunctionSubstringIndex>(); |
876 | | |
877 | 8 | factory.register_alias(FunctionStringLocatePos::name, "position"); |
878 | 8 | } |
879 | | |
880 | | #include "common/compile_check_avoid_end.h" |
881 | | } // namespace doris |