be/src/exprs/function/function_string_mask.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include <fmt/format.h> |
19 | | |
20 | | #include <algorithm> |
21 | | #include <cstddef> |
22 | | |
23 | | #include "common/status.h" |
24 | | #include "core/assert_cast.h" |
25 | | #include "core/block/block.h" |
26 | | #include "core/block/column_numbers.h" |
27 | | #include "core/column/column_const.h" |
28 | | #include "core/column/column_string.h" |
29 | | #include "core/column/column_vector.h" |
30 | | #include "core/data_type/data_type_string.h" |
31 | | #include "core/memcpy_small.h" |
32 | | #include "core/string_ref.h" |
33 | | #include "exprs/function/function.h" |
34 | | #include "exprs/function/function_helpers.h" |
35 | | #include "exprs/function/simple_function_factory.h" |
36 | | #include "exprs/function_context.h" |
37 | | |
38 | | namespace doris { |
39 | | #include "common/compile_check_avoid_begin.h" |
40 | | |
41 | | template <bool Reverse> |
42 | | class FunctionMaskPartial; |
43 | | |
44 | | class FunctionMask : public IFunction { |
45 | | public: |
46 | | static constexpr auto name = "mask"; |
47 | | static constexpr unsigned char DEFAULT_UPPER_MASK = 'X'; |
48 | | static constexpr unsigned char DEFAULT_LOWER_MASK = 'x'; |
49 | | static constexpr unsigned char DEFAULT_NUMBER_MASK = 'n'; |
50 | 0 | String get_name() const override { return name; } |
51 | 4 | static FunctionPtr create() { return std::make_shared<FunctionMask>(); } |
52 | | |
53 | 0 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
54 | 0 | return std::make_shared<DataTypeString>(); |
55 | 0 | } |
56 | | |
57 | 0 | size_t get_number_of_arguments() const override { return 0; } |
58 | | |
59 | 0 | ColumnNumbers get_arguments_that_are_always_constant() const override { return {1, 2, 3}; } |
60 | | |
61 | 2 | bool is_variadic() const override { return true; } |
62 | | |
63 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
64 | 0 | uint32_t result, size_t input_rows_count) const override { |
65 | 0 | DCHECK_GE(arguments.size(), 1); |
66 | 0 | DCHECK_LE(arguments.size(), 4); |
67 | |
|
68 | 0 | char upper = DEFAULT_UPPER_MASK, lower = DEFAULT_LOWER_MASK, number = DEFAULT_NUMBER_MASK; |
69 | |
|
70 | 0 | auto res = ColumnString::create(); |
71 | 0 | const auto& source_column = |
72 | 0 | assert_cast<const ColumnString&>(*block.get_by_position(arguments[0]).column); |
73 | |
|
74 | 0 | if (arguments.size() > 1) { |
75 | 0 | const auto& col = *block.get_by_position(arguments[1]).column; |
76 | 0 | auto string_ref = col.get_data_at(0); |
77 | 0 | if (string_ref.size > 0) { |
78 | 0 | upper = *string_ref.data; |
79 | 0 | } |
80 | 0 | } |
81 | |
|
82 | 0 | if (arguments.size() > 2) { |
83 | 0 | const auto& col = *block.get_by_position(arguments[2]).column; |
84 | 0 | auto string_ref = col.get_data_at(0); |
85 | 0 | if (string_ref.size > 0) { |
86 | 0 | lower = *string_ref.data; |
87 | 0 | } |
88 | 0 | } |
89 | |
|
90 | 0 | if (arguments.size() > 3) { |
91 | 0 | const auto& col = *block.get_by_position(arguments[3]).column; |
92 | 0 | auto string_ref = col.get_data_at(0); |
93 | 0 | if (string_ref.size > 0) { |
94 | 0 | number = *string_ref.data; |
95 | 0 | } |
96 | 0 | } |
97 | |
|
98 | 0 | if (arguments.size() > 4) { |
99 | 0 | return Status::InvalidArgument( |
100 | 0 | fmt::format("too many arguments for function {}", get_name())); |
101 | 0 | } |
102 | | |
103 | 0 | vector_mask(source_column, *res, upper, lower, number); |
104 | |
|
105 | 0 | block.get_by_position(result).column = std::move(res); |
106 | |
|
107 | 0 | return Status::OK(); |
108 | 0 | } |
109 | | friend class FunctionMaskPartial<true>; |
110 | | friend class FunctionMaskPartial<false>; |
111 | | |
112 | | private: |
113 | | static void vector_mask(const ColumnString& source, ColumnString& result, const char upper, |
114 | 0 | const char lower, const char number) { |
115 | 0 | result.get_chars().resize(source.get_chars().size()); |
116 | 0 | result.get_offsets().resize(source.get_offsets().size()); |
117 | 0 | memcpy_small_allow_read_write_overflow15( |
118 | 0 | result.get_offsets().data(), source.get_offsets().data(), |
119 | 0 | source.get_offsets().size() * sizeof(ColumnString::Offset)); |
120 | |
|
121 | 0 | const unsigned char* src = source.get_chars().data(); |
122 | 0 | const size_t size = source.get_chars().size(); |
123 | 0 | unsigned char* res = result.get_chars().data(); |
124 | 0 | mask(src, size, upper, lower, number, res); |
125 | 0 | } |
126 | | |
127 | | static void mask(const unsigned char* __restrict src, const size_t size, |
128 | | const unsigned char upper, const unsigned char lower, |
129 | 0 | const unsigned char number, unsigned char* __restrict res) { |
130 | 0 | for (size_t i = 0; i != size; ++i) { |
131 | 0 | auto c = src[i]; |
132 | 0 | if (c >= 'A' && c <= 'Z') { |
133 | 0 | res[i] = upper; |
134 | 0 | } else if (c >= 'a' && c <= 'z') { |
135 | 0 | res[i] = lower; |
136 | 0 | } else if (c >= '0' && c <= '9') { |
137 | 0 | res[i] = number; |
138 | 0 | } else { |
139 | 0 | res[i] = c; |
140 | 0 | } |
141 | 0 | } |
142 | 0 | } |
143 | | }; |
144 | | |
145 | | template <bool Reverse> |
146 | | class FunctionMaskPartial : public IFunction { |
147 | | public: |
148 | | static constexpr auto name = Reverse ? "mask_last_n" : "mask_first_n"; |
149 | 0 | String get_name() const override { return name; }Unexecuted instantiation: _ZNK5doris19FunctionMaskPartialILb1EE8get_nameB5cxx11Ev Unexecuted instantiation: _ZNK5doris19FunctionMaskPartialILb0EE8get_nameB5cxx11Ev |
150 | 8 | static FunctionPtr create() { return std::make_shared<FunctionMaskPartial>(); }_ZN5doris19FunctionMaskPartialILb1EE6createEv Line | Count | Source | 150 | 4 | static FunctionPtr create() { return std::make_shared<FunctionMaskPartial>(); } |
_ZN5doris19FunctionMaskPartialILb0EE6createEv Line | Count | Source | 150 | 4 | static FunctionPtr create() { return std::make_shared<FunctionMaskPartial>(); } |
|
151 | | |
152 | 0 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
153 | 0 | return std::make_shared<DataTypeString>(); |
154 | 0 | } Unexecuted instantiation: _ZNK5doris19FunctionMaskPartialILb1EE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS6_EE Unexecuted instantiation: _ZNK5doris19FunctionMaskPartialILb0EE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS6_EE |
155 | | |
156 | 0 | size_t get_number_of_arguments() const override { return 0; }Unexecuted instantiation: _ZNK5doris19FunctionMaskPartialILb1EE23get_number_of_argumentsEv Unexecuted instantiation: _ZNK5doris19FunctionMaskPartialILb0EE23get_number_of_argumentsEv |
157 | | |
158 | 4 | bool is_variadic() const override { return true; }_ZNK5doris19FunctionMaskPartialILb1EE11is_variadicEv Line | Count | Source | 158 | 2 | bool is_variadic() const override { return true; } |
_ZNK5doris19FunctionMaskPartialILb0EE11is_variadicEv Line | Count | Source | 158 | 2 | bool is_variadic() const override { return true; } |
|
159 | | |
160 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
161 | 0 | uint32_t result, size_t input_rows_count) const override { |
162 | 0 | auto res = ColumnString::create(); |
163 | 0 | auto col = block.get_by_position(arguments[0]).column->convert_to_full_column_if_const(); |
164 | 0 | const auto& source_column = assert_cast<const ColumnString&>(*col); |
165 | |
|
166 | 0 | if (arguments.size() == 1) { // no 2nd arg, just mask all |
167 | 0 | FunctionMask::vector_mask(source_column, *res, FunctionMask::DEFAULT_UPPER_MASK, |
168 | 0 | FunctionMask::DEFAULT_LOWER_MASK, |
169 | 0 | FunctionMask::DEFAULT_NUMBER_MASK); |
170 | 0 | } else { |
171 | 0 | const auto& [col_2nd, is_const] = |
172 | 0 | unpack_if_const(block.get_by_position(arguments[1]).column); |
173 | |
|
174 | 0 | const auto& col_n = assert_cast<const ColumnInt32&>(*col_2nd); |
175 | |
|
176 | 0 | if (is_const) { |
177 | 0 | RETURN_IF_ERROR(vector<true>(source_column, col_n, *res)); |
178 | 0 | } else { |
179 | 0 | RETURN_IF_ERROR(vector<false>(source_column, col_n, *res)); |
180 | 0 | } |
181 | 0 | } |
182 | | |
183 | 0 | block.get_by_position(result).column = std::move(res); |
184 | |
|
185 | 0 | return Status::OK(); |
186 | 0 | } Unexecuted instantiation: _ZNK5doris19FunctionMaskPartialILb1EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm Unexecuted instantiation: _ZNK5doris19FunctionMaskPartialILb0EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm |
187 | | |
188 | | private: |
189 | | template <bool is_const> |
190 | 0 | static Status vector(const ColumnString& src, const ColumnInt32& col_n, ColumnString& result) { |
191 | 0 | const auto num_rows = src.size(); |
192 | 0 | const auto* chars = src.get_chars().data(); |
193 | 0 | const auto* offsets = src.get_offsets().data(); |
194 | 0 | result.get_chars().resize(src.get_chars().size()); |
195 | 0 | result.get_offsets().resize(src.get_offsets().size()); |
196 | 0 | memcpy_small_allow_read_write_overflow15( |
197 | 0 | result.get_offsets().data(), src.get_offsets().data(), |
198 | 0 | src.get_offsets().size() * sizeof(ColumnString::Offset)); |
199 | 0 | auto* res = result.get_chars().data(); |
200 | |
|
201 | 0 | const auto& col_n_data = col_n.get_data(); |
202 | |
|
203 | 0 | for (ssize_t i = 0; i != num_rows; ++i) { |
204 | 0 | auto offset = offsets[i - 1]; |
205 | 0 | int len = offsets[i] - offset; |
206 | 0 | const int n = col_n_data[index_check_const<is_const>(i)]; |
207 | |
|
208 | 0 | if (n < 0) [[unlikely]] { |
209 | 0 | return Status::InvalidArgument( |
210 | 0 | "function {} only accept non-negative input for 2nd argument but got {}", |
211 | 0 | name, n); |
212 | 0 | } |
213 | | |
214 | 0 | if constexpr (Reverse) { |
215 | 0 | auto start = std::max(len - n, 0); |
216 | 0 | if (start > 0) { |
217 | 0 | memcpy(&res[offset], &chars[offset], start); |
218 | 0 | } |
219 | 0 | offset += start; |
220 | 0 | } else { |
221 | 0 | if (n < len) { |
222 | 0 | memcpy(&res[offset + n], &chars[offset + n], len - n); |
223 | 0 | } |
224 | 0 | } |
225 | |
|
226 | 0 | len = std::min(n, len); |
227 | 0 | FunctionMask::mask(&chars[offset], len, FunctionMask::DEFAULT_UPPER_MASK, |
228 | 0 | FunctionMask::DEFAULT_LOWER_MASK, FunctionMask::DEFAULT_NUMBER_MASK, |
229 | 0 | &res[offset]); |
230 | 0 | } |
231 | | |
232 | 0 | return Status::OK(); |
233 | 0 | } Unexecuted instantiation: _ZN5doris19FunctionMaskPartialILb1EE6vectorILb1EEENS_6StatusERKNS_9ColumnStrIjEERKNS_12ColumnVectorILNS_13PrimitiveTypeE5EEERS5_ Unexecuted instantiation: _ZN5doris19FunctionMaskPartialILb1EE6vectorILb0EEENS_6StatusERKNS_9ColumnStrIjEERKNS_12ColumnVectorILNS_13PrimitiveTypeE5EEERS5_ Unexecuted instantiation: _ZN5doris19FunctionMaskPartialILb0EE6vectorILb1EEENS_6StatusERKNS_9ColumnStrIjEERKNS_12ColumnVectorILNS_13PrimitiveTypeE5EEERS5_ Unexecuted instantiation: _ZN5doris19FunctionMaskPartialILb0EE6vectorILb0EEENS_6StatusERKNS_9ColumnStrIjEERKNS_12ColumnVectorILNS_13PrimitiveTypeE5EEERS5_ |
234 | | }; |
235 | | |
236 | 2 | void register_function_string_mask(SimpleFunctionFactory& factory) { |
237 | 2 | factory.register_function<FunctionMask>(); |
238 | 2 | factory.register_function<FunctionMaskPartial<true>>(); |
239 | 2 | factory.register_function<FunctionMaskPartial<false>>(); |
240 | 2 | } |
241 | | |
242 | | #include "common/compile_check_avoid_end.h" |
243 | | } // namespace doris |