be/src/exprs/function/function_hamming_distance.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include <algorithm> |
19 | | #include <vector> |
20 | | |
21 | | #include "common/status.h" |
22 | | #include "core/column/column_nullable.h" |
23 | | #include "core/column/column_string.h" |
24 | | #include "core/data_type/data_type_number.h" |
25 | | #include "core/string_ref.h" |
26 | | #include "exprs/function/simple_function_factory.h" |
27 | | #include "util/simd/vstring_function.h" |
28 | | |
29 | | namespace doris { |
30 | | |
31 | | class FunctionHammingDistance : public IFunction { |
32 | | public: |
33 | | using ResultDataType = DataTypeInt64; |
34 | | using ResultPaddedPODArray = PaddedPODArray<Int64>; |
35 | | using ResultColumnType = ColumnVector<ResultDataType::PType>; |
36 | | |
37 | | static constexpr auto name = "hamming_distance"; |
38 | | |
39 | 2 | static FunctionPtr create() { return std::make_shared<FunctionHammingDistance>(); } |
40 | | |
41 | 1 | String get_name() const override { return name; } |
42 | 0 | size_t get_number_of_arguments() const override { return 2; } |
43 | | |
44 | 0 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
45 | 0 | const bool has_nullable = std::ranges::any_of( |
46 | 0 | arguments, [](const DataTypePtr& type) { return type->is_nullable(); }); |
47 | 0 | if (has_nullable) { |
48 | 0 | return make_nullable(std::make_shared<ResultDataType>()); |
49 | 0 | } |
50 | 0 | return std::make_shared<ResultDataType>(); |
51 | 0 | } |
52 | | |
53 | 0 | bool use_default_implementation_for_nulls() const override { return false; } |
54 | | |
55 | | Status execute_impl(FunctionContext* /*context*/, Block& block, const ColumnNumbers& arguments, |
56 | 0 | uint32_t result, size_t input_rows_count) const override { |
57 | 0 | const auto& [left_col, left_const] = |
58 | 0 | unpack_if_const(block.get_by_position(arguments[0]).column); |
59 | 0 | const auto& [right_col, right_const] = |
60 | 0 | unpack_if_const(block.get_by_position(arguments[1]).column); |
61 | |
|
62 | 0 | const auto* left_nullable = check_and_get_column<ColumnNullable>(left_col.get()); |
63 | 0 | const auto* right_nullable = check_and_get_column<ColumnNullable>(right_col.get()); |
64 | |
|
65 | 0 | const IColumn* left_nested = |
66 | 0 | left_nullable ? &left_nullable->get_nested_column() : left_col.get(); |
67 | 0 | const IColumn* right_nested = |
68 | 0 | right_nullable ? &right_nullable->get_nested_column() : right_col.get(); |
69 | |
|
70 | 0 | const auto* left_str_col = assert_cast<const ColumnString*>(left_nested); |
71 | 0 | const auto* right_str_col = assert_cast<const ColumnString*>(right_nested); |
72 | |
|
73 | 0 | auto res_col = ResultColumnType::create(input_rows_count); |
74 | 0 | auto& res_data = res_col->get_data(); |
75 | |
|
76 | 0 | const NullMap* left_null_map = |
77 | 0 | left_nullable ? &left_nullable->get_null_map_data() : nullptr; |
78 | 0 | const NullMap* right_null_map = |
79 | 0 | right_nullable ? &right_nullable->get_null_map_data() : nullptr; |
80 | 0 | const bool has_nullable = left_null_map != nullptr || right_null_map != nullptr; |
81 | |
|
82 | 0 | if (!has_nullable) { |
83 | 0 | if (left_const) { |
84 | 0 | RETURN_IF_ERROR(scalar_vector(left_str_col->get_data_at(0).trim_tail_padding_zero(), |
85 | 0 | *right_str_col, res_data)); |
86 | 0 | } else if (right_const) { |
87 | 0 | RETURN_IF_ERROR(vector_scalar( |
88 | 0 | *left_str_col, right_str_col->get_data_at(0).trim_tail_padding_zero(), |
89 | 0 | res_data)); |
90 | 0 | } else { |
91 | 0 | RETURN_IF_ERROR(vector_vector(*left_str_col, *right_str_col, res_data)); |
92 | 0 | } |
93 | 0 | block.replace_by_position(result, std::move(res_col)); |
94 | 0 | return Status::OK(); |
95 | 0 | } |
96 | | |
97 | 0 | auto null_col = ColumnUInt8::create(input_rows_count, 0); |
98 | 0 | auto& null_map = null_col->get_data(); |
99 | 0 | if (left_const) { |
100 | 0 | if (left_null_map && (*left_null_map)[0]) { |
101 | 0 | std::fill(null_map.begin(), null_map.end(), 1); |
102 | 0 | block.replace_by_position( |
103 | 0 | result, ColumnNullable::create(std::move(res_col), std::move(null_col))); |
104 | 0 | return Status::OK(); |
105 | 0 | } |
106 | | |
107 | 0 | const auto left = left_str_col->get_data_at(0).trim_tail_padding_zero(); |
108 | 0 | RETURN_IF_ERROR(scalar_vector_nullable(left, *right_str_col, right_null_map, res_data, |
109 | 0 | null_map)); |
110 | 0 | } else if (right_const) { |
111 | 0 | if (right_null_map && (*right_null_map)[0]) { |
112 | 0 | std::fill(null_map.begin(), null_map.end(), 1); |
113 | 0 | block.replace_by_position( |
114 | 0 | result, ColumnNullable::create(std::move(res_col), std::move(null_col))); |
115 | 0 | return Status::OK(); |
116 | 0 | } |
117 | | |
118 | 0 | RETURN_IF_ERROR(vector_scalar_nullable( |
119 | 0 | *left_str_col, right_str_col->get_data_at(0).trim_tail_padding_zero(), |
120 | 0 | left_null_map, res_data, null_map)); |
121 | 0 | } else { |
122 | 0 | for (size_t i = 0; i < input_rows_count; ++i) { |
123 | 0 | const bool left_is_null = left_null_map && (*left_null_map)[i]; |
124 | 0 | const bool right_is_null = right_null_map && (*right_null_map)[i]; |
125 | 0 | if (left_is_null || right_is_null) { |
126 | 0 | null_map[i] = 1; |
127 | 0 | res_data[i] = 0; |
128 | 0 | continue; |
129 | 0 | } |
130 | | |
131 | 0 | RETURN_IF_ERROR(hamming_distance( |
132 | 0 | left_str_col->get_data_at(i).trim_tail_padding_zero(), |
133 | 0 | right_str_col->get_data_at(i).trim_tail_padding_zero(), res_data[i], i)); |
134 | 0 | } |
135 | 0 | } |
136 | | |
137 | 0 | block.replace_by_position(result, |
138 | 0 | ColumnNullable::create(std::move(res_col), std::move(null_col))); |
139 | 0 | return Status::OK(); |
140 | 0 | } |
141 | | |
142 | | private: |
143 | | static Status vector_vector(const ColumnString& lcol, const ColumnString& rcol, |
144 | 0 | ResultPaddedPODArray& res) { |
145 | 0 | DCHECK_EQ(lcol.size(), rcol.size()); |
146 | |
|
147 | 0 | const size_t size = lcol.size(); |
148 | 0 | res.resize(size); |
149 | 0 | std::vector<size_t> left_offsets; |
150 | 0 | std::vector<size_t> right_offsets; |
151 | 0 | for (size_t i = 0; i < size; ++i) { |
152 | 0 | const auto left = lcol.get_data_at(i).trim_tail_padding_zero(); |
153 | 0 | const auto right = rcol.get_data_at(i).trim_tail_padding_zero(); |
154 | 0 | RETURN_IF_ERROR(hamming_distance_with_offsets( |
155 | 0 | left, left_offsets, false, simd::VStringFunctions::is_ascii(left), right, |
156 | 0 | right_offsets, false, simd::VStringFunctions::is_ascii(right), res[i], i)); |
157 | 0 | } |
158 | 0 | return Status::OK(); |
159 | 0 | } |
160 | | |
161 | | static Status vector_scalar(const ColumnString& lcol, const StringRef& rdata, |
162 | 0 | ResultPaddedPODArray& res) { |
163 | 0 | const size_t size = lcol.size(); |
164 | 0 | res.resize(size); |
165 | 0 | const bool right_ascii = simd::VStringFunctions::is_ascii(rdata); |
166 | 0 | std::vector<size_t> right_offsets; |
167 | 0 | simd::VStringFunctions::get_utf8_char_offsets(rdata, right_offsets); |
168 | 0 | std::vector<size_t> left_offsets; |
169 | 0 | for (size_t i = 0; i < size; ++i) { |
170 | 0 | const auto left = lcol.get_data_at(i).trim_tail_padding_zero(); |
171 | 0 | RETURN_IF_ERROR(hamming_distance_with_offsets( |
172 | 0 | left, left_offsets, false, simd::VStringFunctions::is_ascii(left), rdata, |
173 | 0 | right_offsets, true, right_ascii, res[i], i)); |
174 | 0 | } |
175 | 0 | return Status::OK(); |
176 | 0 | } |
177 | | |
178 | | static Status scalar_vector(const StringRef& ldata, const ColumnString& rcol, |
179 | 0 | ResultPaddedPODArray& res) { |
180 | 0 | const size_t size = rcol.size(); |
181 | 0 | res.resize(size); |
182 | 0 | const bool left_ascii = simd::VStringFunctions::is_ascii(ldata); |
183 | 0 | std::vector<size_t> left_offsets; |
184 | 0 | simd::VStringFunctions::get_utf8_char_offsets(ldata, left_offsets); |
185 | 0 | std::vector<size_t> right_offsets; |
186 | 0 | for (size_t i = 0; i < size; ++i) { |
187 | 0 | const auto right = rcol.get_data_at(i).trim_tail_padding_zero(); |
188 | 0 | RETURN_IF_ERROR(hamming_distance_with_offsets( |
189 | 0 | ldata, left_offsets, true, left_ascii, right, right_offsets, false, |
190 | 0 | simd::VStringFunctions::is_ascii(right), res[i], i)); |
191 | 0 | } |
192 | 0 | return Status::OK(); |
193 | 0 | } |
194 | | |
195 | | static Status vector_scalar_nullable(const ColumnString& lcol, const StringRef& rdata, |
196 | | const NullMap* left_null_map, ResultPaddedPODArray& res, |
197 | 0 | NullMap& null_map) { |
198 | 0 | const size_t size = lcol.size(); |
199 | 0 | res.resize(size); |
200 | 0 | const bool right_ascii = simd::VStringFunctions::is_ascii(rdata); |
201 | 0 | std::vector<size_t> right_offsets; |
202 | 0 | simd::VStringFunctions::get_utf8_char_offsets(rdata, right_offsets); |
203 | 0 | std::vector<size_t> left_offsets; |
204 | 0 | for (size_t i = 0; i < size; ++i) { |
205 | 0 | if (left_null_map && (*left_null_map)[i]) { |
206 | 0 | null_map[i] = 1; |
207 | 0 | res[i] = 0; |
208 | 0 | continue; |
209 | 0 | } |
210 | | |
211 | 0 | const auto left = lcol.get_data_at(i).trim_tail_padding_zero(); |
212 | 0 | RETURN_IF_ERROR(hamming_distance_with_offsets( |
213 | 0 | left, left_offsets, false, simd::VStringFunctions::is_ascii(left), rdata, |
214 | 0 | right_offsets, true, right_ascii, res[i], i)); |
215 | 0 | } |
216 | 0 | return Status::OK(); |
217 | 0 | } |
218 | | |
219 | | static Status scalar_vector_nullable(const StringRef& ldata, const ColumnString& rcol, |
220 | | const NullMap* right_null_map, ResultPaddedPODArray& res, |
221 | 0 | NullMap& null_map) { |
222 | 0 | const size_t size = rcol.size(); |
223 | 0 | res.resize(size); |
224 | 0 | const bool left_ascii = simd::VStringFunctions::is_ascii(ldata); |
225 | 0 | std::vector<size_t> left_offsets; |
226 | 0 | simd::VStringFunctions::get_utf8_char_offsets(ldata, left_offsets); |
227 | 0 | std::vector<size_t> right_offsets; |
228 | 0 | for (size_t i = 0; i < size; ++i) { |
229 | 0 | if (right_null_map && (*right_null_map)[i]) { |
230 | 0 | null_map[i] = 1; |
231 | 0 | res[i] = 0; |
232 | 0 | continue; |
233 | 0 | } |
234 | | |
235 | 0 | const auto right = rcol.get_data_at(i).trim_tail_padding_zero(); |
236 | 0 | RETURN_IF_ERROR(hamming_distance_with_offsets( |
237 | 0 | ldata, left_offsets, true, left_ascii, right, right_offsets, false, |
238 | 0 | simd::VStringFunctions::is_ascii(right), res[i], i)); |
239 | 0 | } |
240 | 0 | return Status::OK(); |
241 | 0 | } |
242 | | |
243 | | static Status hamming_distance_ascii(const StringRef& left, const StringRef& right, |
244 | 0 | Int64& result, size_t row) { |
245 | 0 | if (left.size != right.size) { |
246 | 0 | return Status::InvalidArgument( |
247 | 0 | "hamming_distance requires strings of the same length at row {}", row); |
248 | 0 | } |
249 | | |
250 | 0 | Int64 distance = 0; |
251 | 0 | for (size_t i = 0; i < left.size; ++i) { |
252 | 0 | distance += static_cast<Int64>(left.data[i] != right.data[i]); |
253 | 0 | } |
254 | 0 | result = distance; |
255 | 0 | return Status::OK(); |
256 | 0 | } |
257 | | |
258 | | static Status hamming_distance_utf8(const StringRef& left, |
259 | | const std::vector<size_t>& left_offsets, |
260 | | const StringRef& right, |
261 | | const std::vector<size_t>& right_offsets, Int64& result, |
262 | 0 | size_t row) { |
263 | 0 | if (left_offsets.size() != right_offsets.size()) { |
264 | 0 | return Status::InvalidArgument( |
265 | 0 | "hamming_distance requires strings of the same length at row {}", row); |
266 | 0 | } |
267 | | |
268 | 0 | Int64 distance = 0; |
269 | 0 | const size_t len = left_offsets.size(); |
270 | 0 | for (size_t i = 0; i + 1 < len; ++i) { |
271 | 0 | const size_t left_off = left_offsets[i]; |
272 | 0 | const size_t left_next = left_offsets[i + 1]; |
273 | 0 | const size_t right_off = right_offsets[i]; |
274 | 0 | const size_t right_next = right_offsets[i + 1]; |
275 | 0 | distance += static_cast<Int64>(!simd::VStringFunctions::utf8_char_equal( |
276 | 0 | left, left_off, left_next, right, right_off, right_next)); |
277 | 0 | } |
278 | 0 | if (len > 0) { |
279 | 0 | const size_t left_off = left_offsets[len - 1]; |
280 | 0 | const size_t right_off = right_offsets[len - 1]; |
281 | 0 | distance += static_cast<Int64>(!simd::VStringFunctions::utf8_char_equal( |
282 | 0 | left, left_off, left.size, right, right_off, right.size)); |
283 | 0 | } |
284 | |
|
285 | 0 | result = distance; |
286 | 0 | return Status::OK(); |
287 | 0 | } |
288 | | |
289 | | static Status hamming_distance_with_offsets( |
290 | | const StringRef& left, std::vector<size_t>& left_offsets, bool left_offsets_ready, |
291 | | bool left_ascii, const StringRef& right, std::vector<size_t>& right_offsets, |
292 | 0 | bool right_offsets_ready, bool right_ascii, Int64& result, size_t row) { |
293 | 0 | if (left_ascii && right_ascii) { |
294 | 0 | return hamming_distance_ascii(left, right, result, row); |
295 | 0 | } |
296 | | |
297 | 0 | if (!left_offsets_ready) { |
298 | 0 | simd::VStringFunctions::get_utf8_char_offsets(left, left_offsets); |
299 | 0 | } |
300 | 0 | if (!right_offsets_ready) { |
301 | 0 | simd::VStringFunctions::get_utf8_char_offsets(right, right_offsets); |
302 | 0 | } |
303 | 0 | return hamming_distance_utf8(left, left_offsets, right, right_offsets, result, row); |
304 | 0 | } |
305 | | |
306 | | static Status hamming_distance(const StringRef& left, const StringRef& right, Int64& result, |
307 | 0 | size_t row) { |
308 | 0 | std::vector<size_t> left_offsets; |
309 | 0 | std::vector<size_t> right_offsets; |
310 | 0 | return hamming_distance_with_offsets( |
311 | 0 | left, left_offsets, false, simd::VStringFunctions::is_ascii(left), right, |
312 | 0 | right_offsets, false, simd::VStringFunctions::is_ascii(right), result, row); |
313 | 0 | } |
314 | | }; |
315 | | |
316 | 1 | void register_function_hamming_distance(SimpleFunctionFactory& factory) { |
317 | 1 | factory.register_function<FunctionHammingDistance>(); |
318 | 1 | } |
319 | | |
320 | | } // namespace doris |