be/src/exprs/function/function_string_misc.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include <crc32c/crc32c.h> |
19 | | #include <fmt/format.h> |
20 | | #include <glog/logging.h> |
21 | | #include <unicode/normalizer2.h> |
22 | | #include <unicode/stringpiece.h> |
23 | | #include <unicode/unistr.h> |
24 | | |
25 | | #include <algorithm> |
26 | | #include <bit> |
27 | | #include <boost/locale.hpp> |
28 | | #include <climits> |
29 | | #include <cstddef> |
30 | | #include <cstdint> |
31 | | #include <cstdlib> |
32 | | #include <cstring> |
33 | | #include <format> |
34 | | #include <iomanip> |
35 | | #include <memory> |
36 | | #include <random> |
37 | | #include <sstream> |
38 | | #include <string> |
39 | | #include <string_view> |
40 | | #include <unordered_map> |
41 | | #include <utility> |
42 | | #include <vector> |
43 | | |
44 | | #include "common/compiler_util.h" |
45 | | #include "common/exception.h" |
46 | | #include "common/status.h" |
47 | | #include "core/assert_cast.h" |
48 | | #include "core/block/block.h" |
49 | | #include "core/block/column_numbers.h" |
50 | | #include "core/block/column_with_type_and_name.h" |
51 | | #include "core/column/column.h" |
52 | | #include "core/column/column_const.h" |
53 | | #include "core/column/column_nullable.h" |
54 | | #include "core/column/column_string.h" |
55 | | #include "core/column/column_vector.h" |
56 | | #include "core/data_type/data_type.h" |
57 | | #include "core/data_type/data_type_nullable.h" |
58 | | #include "core/data_type/data_type_number.h" |
59 | | #include "core/data_type/data_type_string.h" |
60 | | #include "core/data_type/define_primitive_type.h" |
61 | | #include "core/memcpy_small.h" |
62 | | #include "core/pod_array.h" |
63 | | #include "core/string_ref.h" |
64 | | #include "core/types.h" |
65 | | #include "exec/common/hash_table/phmap_fwd_decl.h" |
66 | | #include "exec/common/pinyin.h" |
67 | | #include "exec/common/stringop_substring.h" |
68 | | #include "exec/common/template_helpers.hpp" |
69 | | #include "exprs/function/function.h" |
70 | | #include "exprs/function/function_helpers.h" |
71 | | #include "exprs/function/function_needs_to_handle_null.h" |
72 | | #include "exprs/function_context.h" |
73 | | #include "pugixml.hpp" |
74 | | #include "util/hash_util.hpp" |
75 | | #include "util/raw_value.h" |
76 | | #include "util/simd/vstring_function.h" |
77 | | #include "util/string_util.h" |
78 | | #include "util/utf8_check.h" |
79 | | |
80 | | #ifndef USE_LIBCPP |
81 | | #include <memory_resource> |
82 | | #define PMR std::pmr |
83 | | #else |
84 | | #include <boost/container/pmr/monotonic_buffer_resource.hpp> |
85 | | #include <boost/container/pmr/vector.hpp> |
86 | | #define PMR boost::container::pmr |
87 | | #endif |
88 | | |
89 | | #include "exprs/function/simple_function_factory.h" |
90 | | |
91 | | namespace doris { |
92 | | #include "common/compile_check_avoid_begin.h" |
93 | | |
/// SQL function `auto_partition_name`: builds the name of an auto-created partition.
///
/// The first argument selects the flavour:
///   * "list"  : every following argument is one partition value; the name is 'p'
///               followed by an encoding of each value (see _auto_partition_type_of_list).
///   * otherwise (range): the second argument is the time unit
///               (year|month|day|hour|minute|second) and the third argument is a
///               DATE/DATETIME string; the name is 'p' followed by 14 digits
///               (see _auto_partition_type_of_range).
class FunctionAutoPartitionName : public IFunction {
public:
    static constexpr auto name = "auto_partition_name";
    static FunctionPtr create() { return std::make_shared<FunctionAutoPartitionName>(); }
    String get_name() const override { return name; }
    size_t get_number_of_arguments() const override { return 0; }
    bool is_variadic() const override { return true; }
    // NULL arguments are handled explicitly (list partitions encode them as 'X').
    bool use_default_implementation_for_nulls() const override { return false; }
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
        return std::make_shared<DataTypeString>();
    }

    /// Unpacks all arguments into raw chars/offsets/null-map views and dispatches to the
    /// list or range implementation depending on the first argument.
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
                        uint32_t result, size_t input_rows_count) const override {
        const size_t argument_size = arguments.size();
        if (argument_size == 0) {
            return Status::InvalidArgument("function {} requires at least one argument", name);
        }

        // All-zero null map shared by every argument that is not nullable.
        auto const_null_map = ColumnUInt8::create(input_rows_count, 0);
        std::vector<const ColumnString::Chars*> chars_list(argument_size);
        std::vector<const ColumnString::Offsets*> offsets_list(argument_size);
        std::vector<bool> is_const_args(argument_size);
        std::vector<const ColumnUInt8::Container*> null_list(argument_size);
        // Holds a reference to each null-map column so the pointers in null_list stay valid
        // after argument_columns[i] is replaced by the nested column.
        std::vector<ColumnPtr> argument_null_columns(argument_size);

        std::vector<ColumnPtr> argument_columns(argument_size);
        for (size_t i = 0; i < argument_size; ++i) {
            const auto& source_column = block.get_by_position(arguments[i]).column;
            argument_columns[i] = source_column->convert_to_full_column_if_const();
            if (const auto* nullable =
                        check_and_get_column<const ColumnNullable>(*argument_columns[i])) {
                null_list[i] = &nullable->get_null_map_data();
                argument_null_columns[i] = nullable->get_null_map_column_ptr();
                argument_columns[i] = nullable->get_nested_column_ptr();
            } else {
                null_list[i] = &const_null_map->get_data();
            }

            const auto& [col, is_const] = unpack_if_const(source_column);

            const auto* col_str = assert_cast<const ColumnString*>(argument_columns[i].get());
            chars_list[i] = &col_str->get_chars();
            offsets_list[i] = &col_str->get_offsets();
            is_const_args[i] = is_const;
        }

        auto res = ColumnString::create();
        auto& res_data = res->get_chars();
        auto& res_offset = res->get_offsets();
        res_offset.resize(input_rows_count);

        // The partition type is read from the start of the first argument's character
        // buffer, i.e. from its first row; it is "list" or a range type.
        const char* partition_type = chars_list[0]->raw_data();
        if (std::strncmp(partition_type, "list", 4) == 0) {
            return _auto_partition_type_of_list(chars_list, offsets_list, is_const_args, null_list,
                                                res_data, res_offset, input_rows_count,
                                                argument_size, block, result, res);
        }

        // The range implementation reads arguments 1 (unit) and 2 (date string).
        if (argument_size < 3) {
            return Status::InvalidArgument(
                    "function {} requires 3 arguments for range partition, but got {}", name,
                    argument_size);
        }
        return _auto_partition_type_of_range(chars_list, offsets_list, is_const_args, res_data,
                                             res_offset, input_rows_count, argument_size, block,
                                             result, res);
    }

private:
    // Every range partition name is exactly this long: 'p' + yyyyMMddHHmmss.
    static constexpr size_t RANGE_NAME_LEN = 15;

    /// Converts a UTF-8 string to UTF-16 code units.
    std::u16string _string_to_u16string(const std::string& str) const {
        return boost::locale::conv::utf_to_utf<char16_t>(str);
    }

    /// Encodes a UTF-16 string so that it only contains [A-Za-z0-9_]:
    /// ASCII letters and digits are kept, every other code unit is replaced by the
    /// lower-case hex value of the code point found at that position. A leading '-'
    /// additionally produces a '_' prefix.
    std::string _string_to_unicode(const std::u16string& s) const {
        std::string res_s;
        res_s.reserve(s.size());
        if (!s.empty() && s[0] == '-') {
            res_s += '_';
        }
        for (size_t i = 0; i < s.length(); i++) {
            const char16_t ch = s[i];
            const bool is_ascii_alnum = (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') ||
                                        (ch >= '0' && ch <= '9');
            if (is_ascii_alnum) {
                res_s += static_cast<char>(ch);
            } else {
                // NOTE(review): for a surrogate pair the low surrogate is visited again on
                // the next iteration and encoded on its own; kept as-is because changing it
                // would change generated partition names.
                const int code_point = _get_code_point_at(s, i);
                res_s += fmt::format("{:02x}", static_cast<uint32_t>(code_point));
            }
        }
        return res_s;
    }

    /// Returns the Unicode code point starting at `index`; a valid surrogate pair is
    /// combined, anything else yields the single code unit.
    int _get_code_point_at(const std::u16string& str, std::size_t index) const {
        const char16_t first = str[index];
        // [0xD800,0xDBFF] is the scope of the first code unit
        const bool is_high_surrogate = first >= 0xD800 && first <= 0xDBFF;
        if (is_high_surrogate && index + 1 < str.size()) {
            const char16_t second = str[index + 1];
            // [0xDC00,0xDFFF] is the scope of the second code unit
            if (second >= 0xDC00 && second <= 0xDFFF) {
                return ((first - 0xD800) << 10) + (second - 0xDC00) + 0x10000;
            }
        }

        return first;
    }

    /// List partition: for every row emits 'p' followed, per value argument, by either
    /// 'X' (NULL) or the encoded value plus its UTF-16 length. Names longer than 50
    /// characters are cut to 50 and suffixed with "_" and an 8-digit hex hash of the
    /// full name.
    Status _auto_partition_type_of_list(std::vector<const ColumnString::Chars*>& chars_list,
                                        std::vector<const ColumnString::Offsets*>& offsets_list,
                                        std::vector<bool>& is_const_args,
                                        const std::vector<const ColumnUInt8::Container*>& null_list,
                                        auto& res_data, auto& res_offset, size_t input_rows_count,
                                        size_t argument_size, Block& block, uint32_t result,
                                        auto& res) const {
        size_t total_len = 0;
        for (size_t row = 0; row < input_rows_count; row++) {
            std::string res_p;
            res_p.reserve(argument_size * 5);
            res_p += 'p';
            // Argument 0 is the partition type, the values start at argument 1.
            for (size_t col = 1; col < argument_size; col++) {
                const auto& current_offsets = *offsets_list[col];
                const auto& current_chars = *chars_list[col];
                const auto& current_nullmap = *null_list[col];

                if (current_nullmap[row]) {
                    res_p += 'X';
                    continue;
                }

                const auto idx = index_check_const(row, is_const_args[col]);
                const int size = current_offsets[idx] - current_offsets[idx - 1];
                const char* raw_chars =
                        reinterpret_cast<const char*>(&current_chars[current_offsets[idx - 1]]);
                // convert string to u16string in order to convert to unicode strings
                const std::string raw_str(raw_chars, size);
                const auto u16string = _string_to_u16string(raw_str);
                res_p += _string_to_unicode(u16string) + std::to_string(u16string.size());
            }

            // Limit the length of the name.
            if (res_p.size() > 50) {
                res_p = std::format("{}_{:08x}", res_p.substr(0, 50), to_hash_code(res_p));
            }
            const size_t len = res_p.size();
            total_len += len;
            res_data.resize(total_len);
            memcpy(&res_data[res_offset[row - 1]], res_p.c_str(), len);
            res_offset[row] = res_offset[row - 1] + len;
        }
        block.get_by_position(result).column = std::move(res);
        return Status::OK();
    }

    /// Appends the first `len` entries of `date_str` behind the leading 'p' of row `row`.
    /// Returns the number of bytes of the row used so far, including the 'p'.
    size_t _copy_date_str_of_len_to_res_data(auto& res_data, auto& res_offset,
                                             std::vector<std::string>& date_str, size_t row,
                                             size_t len) const {
        size_t used = 1; // the leading 'p'
        for (size_t j = 0; j < len; j++) {
            const std::string& part = date_str[j];
            memcpy(&res_data[res_offset[row - 1]] + used, part.c_str(), part.size());
            used += part.size();
        }
        return used;
    }

    /// Range partition: truncates the DATE/DATETIME string in argument 2 to the unit in
    /// argument 1 and emits 'p' + 14 digits. Returns InvalidArgument when a row is not
    /// formatted as "yyyy-mm-dd" or "yyyy-mm-dd hh:mm:ss".
    Status _auto_partition_type_of_range(std::vector<const ColumnString::Chars*>& chars_list,
                                         std::vector<const ColumnString::Offsets*>& offsets_list,
                                         std::vector<bool>& is_const_args, auto& res_data,
                                         auto& res_offset, size_t input_rows_count,
                                         size_t argument_size, Block& block, uint32_t result,
                                         auto& res) const {
        // The unit is read from the start of argument 1's buffer, i.e. from its first row.
        const char* range_type = chars_list[1]->raw_data();
        const auto& current_offsets = *offsets_list[2];
        const auto& current_chars = *chars_list[2];
        const bool date_is_const = is_const_args[2];

        // Compiled once per call instead of once per row.
        const RE2 date_regex(R"(^\d{4}-\d{2}-\d{2}( \d{2}:\d{2}:\d{2})?$)");

        res_data.resize(RANGE_NAME_LEN * input_rows_count);
        for (size_t i = 0; i < input_rows_count; i++) {
            const auto idx = index_check_const(i, date_is_const);
            const int size = current_offsets[idx] - current_offsets[idx - 1];
            const char* tmp =
                    reinterpret_cast<const char*>(&current_chars[current_offsets[idx - 1]]);
            const std::string to_split_s(tmp, size);

            // check the str if it is date|datetime
            if (!RE2::FullMatch(to_split_s, date_regex)) {
                return Status::InvalidArgument("The range partition only support DATE|DATETIME");
            }

            // split date_str from (yyyy-mm-dd hh:mm:ss) to ([yyyy, mm, dd, hh, mm, ss]);
            // for a plain date the time fields stay empty.
            std::vector<std::string> date_str(6);
            date_str[0] = to_split_s.substr(0, 4);
            for (int ni = 5, j = 1; ni <= size; ni += 3, j++) {
                date_str[j] = to_split_s.substr(ni, 2);
            }
            int curr_len = 0;

            res_data[res_offset[i - 1]] = 'p';
            // raw    => 2022-12-12 11:30:20
            // year   => 2022 01 01 00 00 00
            // month  => 2022 12 01 00 00 00
            // day    => 2022 12 12 00 00 00
            // hour   => 2022 12 12 11 00 00
            // minute => 2022 12 12 11 30 00
            // second => 2022 12 12 11 30 20
            //
            // NOTE(review): when the unit matches none of the branches below, curr_len stays
            // 0 and the zero fill overwrites the leading 'p'. Kept unchanged; confirm that
            // callers always pass a valid unit.
            if (!strncmp(range_type, "year", 4)) {
                curr_len += _copy_date_str_of_len_to_res_data(res_data, res_offset, date_str, i, 1);
                memcpy(&res_data[res_offset[i - 1]] + curr_len, "0101", 4);
                curr_len += 4;
            } else if (!strncmp(range_type, "month", 5)) {
                curr_len += _copy_date_str_of_len_to_res_data(res_data, res_offset, date_str, i, 2);
                memcpy(&res_data[res_offset[i - 1]] + curr_len, "01", 2);
                curr_len += 2;
            } else if (!strncmp(range_type, "day", 3)) {
                curr_len += _copy_date_str_of_len_to_res_data(res_data, res_offset, date_str, i, 3);
            } else if (!strncmp(range_type, "hour", 4)) {
                curr_len += _copy_date_str_of_len_to_res_data(res_data, res_offset, date_str, i, 4);
            } else if (!strncmp(range_type, "minute", 6)) {
                curr_len += _copy_date_str_of_len_to_res_data(res_data, res_offset, date_str, i, 5);
            } else if (!strncmp(range_type, "second", 6)) {
                curr_len += _copy_date_str_of_len_to_res_data(res_data, res_offset, date_str, i, 6);
            }

            // Pad the remaining positions with '0' so every name has RANGE_NAME_LEN bytes.
            const int zero = static_cast<int>(RANGE_NAME_LEN) - curr_len;
            std::fill_n(&res_data[res_offset[i - 1]] + curr_len, zero, '0');
            curr_len += zero;
            res_offset[i] = res_offset[i - 1] + curr_len;
        }
        block.get_by_position(result).column = std::move(res);
        return Status::OK();
    }

    /// Polynomial hash (h = h * 31 + byte) over the bytes of `str`, truncated to 32 bits.
    int32_t to_hash_code(const std::string& str) const {
        uint64_t h = 0;
        for (const uint8_t c : str) {
            h = (h * 31U + c) & 0xFFFFFFFFU;
        }
        return static_cast<int32_t>(h);
    }
};
331 | | |
332 | | class FunctionRandomBytes : public IFunction { |
333 | | public: |
334 | | static constexpr auto name = "random_bytes"; |
335 | 14 | static FunctionPtr create() { return std::make_shared<FunctionRandomBytes>(); } |
336 | 1 | String get_name() const override { return name; } |
337 | 5 | size_t get_number_of_arguments() const override { return 1; } |
338 | 6 | bool is_variadic() const override { return false; } |
339 | | |
340 | 5 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
341 | 5 | return std::make_shared<DataTypeString>(); |
342 | 5 | } |
343 | | |
344 | 15 | bool use_default_implementation_for_constants() const final { return false; } |
345 | | |
346 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
347 | 4 | uint32_t result, size_t input_rows_count) const override { |
348 | 4 | auto res = ColumnString::create(); |
349 | 4 | auto& res_offsets = res->get_offsets(); |
350 | 4 | auto& res_chars = res->get_chars(); |
351 | 4 | res_offsets.resize(input_rows_count); |
352 | | |
353 | 4 | auto [arg_col, arg_const] = unpack_if_const(block.get_by_position(arguments[0]).column); |
354 | 4 | const auto* length_col = assert_cast<const ColumnInt32*>(arg_col.get()); |
355 | | |
356 | 4 | if (arg_const) { |
357 | 3 | res_chars.reserve(input_rows_count * (length_col->get_element(0) + 2)); |
358 | 3 | } |
359 | | |
360 | 4 | std::vector<uint8_t, Allocator_<uint8_t>> random_bytes; |
361 | 4 | std::random_device rd; |
362 | 4 | std::mt19937 gen(rd()); |
363 | | |
364 | 4 | std::uniform_int_distribution<unsigned short> distribution(0, 255); |
365 | 19 | for (size_t i = 0; i < input_rows_count; ++i) { |
366 | 16 | size_t index = index_check_const(i, arg_const); |
367 | 16 | if (length_col->get_element(index) < 0) [[unlikely]] { |
368 | 1 | return Status::InvalidArgument("argument {} of function {} at row {} was invalid.", |
369 | 1 | length_col->get_element(index), name, index); |
370 | 1 | } |
371 | 15 | random_bytes.resize(length_col->get_element(index)); |
372 | | |
373 | 117 | for (auto& byte : random_bytes) { |
374 | 117 | byte = distribution(gen) & 0xFF; |
375 | 117 | } |
376 | | |
377 | 15 | std::basic_ostringstream<char, std::char_traits<char>, Allocator_<char>> oss; |
378 | 117 | for (const auto& byte : random_bytes) { |
379 | 117 | oss << std::setw(2) << std::setfill('0') << std::hex << static_cast<int>(byte); |
380 | 117 | } |
381 | | |
382 | 15 | StringOP::push_value_string("0x" + oss.str(), i, res_chars, res_offsets); |
383 | 15 | random_bytes.clear(); |
384 | 15 | } |
385 | | |
386 | 3 | block.get_by_position(result).column = std::move(res); |
387 | | |
388 | 3 | return Status::OK(); |
389 | 4 | } |
390 | | }; |
391 | | |
392 | | class FunctionConvertTo : public IFunction { |
393 | | public: |
394 | | static constexpr auto name = "convert_to"; |
395 | | |
396 | 15 | static FunctionPtr create() { return std::make_shared<FunctionConvertTo>(); } |
397 | | |
398 | 1 | String get_name() const override { return name; } |
399 | | |
400 | 6 | size_t get_number_of_arguments() const override { return 2; } |
401 | | |
402 | 6 | DataTypePtr get_return_type_impl(const DataTypes& /*arguments*/) const override { |
403 | 6 | return std::make_shared<DataTypeString>(); |
404 | 6 | } |
405 | | |
406 | 29 | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override { |
407 | 29 | if (scope != FunctionContext::THREAD_LOCAL) { |
408 | 6 | return Status::OK(); |
409 | 6 | } |
410 | 23 | if (!context->is_col_constant(1)) { |
411 | 0 | return Status::InvalidArgument( |
412 | 0 | "character argument to convert function must be constant."); |
413 | 0 | } |
414 | 23 | const auto& character_data = context->get_constant_col(1)->column_ptr->get_data_at(0); |
415 | 23 | if (!iequal(character_data.to_string(), "gbk")) { |
416 | 0 | return Status::RuntimeError( |
417 | 0 | "Illegal second argument column of function convert. now only support " |
418 | 0 | "convert to character set of gbk"); |
419 | 0 | } |
420 | | |
421 | 23 | return Status::OK(); |
422 | 23 | } |
423 | | |
424 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
425 | 14 | uint32_t result, size_t input_rows_count) const override { |
426 | 14 | ColumnPtr argument_column = |
427 | 14 | block.get_by_position(arguments[0]).column->convert_to_full_column_if_const(); |
428 | 14 | const ColumnString* str_col = static_cast<const ColumnString*>(argument_column.get()); |
429 | 14 | const auto& str_offset = str_col->get_offsets(); |
430 | 14 | const auto& str_chars = str_col->get_chars(); |
431 | 14 | auto col_res = ColumnString::create(); |
432 | 14 | auto& res_offset = col_res->get_offsets(); |
433 | 14 | auto& res_chars = col_res->get_chars(); |
434 | 14 | res_offset.resize(input_rows_count); |
435 | | // max pinyin size is 6 + 1 (first '~') for utf8 chinese word 3 |
436 | 14 | size_t pinyin_size = (str_chars.size() + 2) / 3 * 7; |
437 | 14 | ColumnString::check_chars_length(pinyin_size, 0); |
438 | 14 | res_chars.resize(pinyin_size); |
439 | | |
440 | 14 | size_t in_len = 0, out_len = 0; |
441 | 49 | for (int i = 0; i < input_rows_count; ++i) { |
442 | 35 | in_len = str_offset[i] - str_offset[i - 1]; |
443 | 35 | const char* in = reinterpret_cast<const char*>(&str_chars[str_offset[i - 1]]); |
444 | 35 | char* out = reinterpret_cast<char*>(&res_chars[res_offset[i - 1]]); |
445 | 35 | _utf8_to_pinyin(in, in_len, out, &out_len); |
446 | 35 | res_offset[i] = res_offset[i - 1] + out_len; |
447 | 35 | } |
448 | 14 | res_chars.resize(res_offset[input_rows_count - 1]); |
449 | 14 | block.replace_by_position(result, std::move(col_res)); |
450 | 14 | return Status::OK(); |
451 | 14 | } |
452 | | |
453 | 35 | void _utf8_to_pinyin(const char* in, size_t in_len, char* out, size_t* out_len) const { |
454 | 225 | auto do_memcpy = [](char*& dest, const char*& from, size_t size) { |
455 | 225 | memcpy_small_allow_read_write_overflow15(dest, from, size); |
456 | 225 | dest += size; |
457 | 225 | from += size; |
458 | 225 | }; |
459 | 35 | auto from = in; |
460 | 35 | auto dest = out; |
461 | | |
462 | 273 | while (from - in < in_len) { |
463 | 238 | auto length = get_utf8_byte_length(*from); |
464 | 238 | if (length != 3) { |
465 | 225 | do_memcpy(dest, from, length); |
466 | 225 | } else { |
467 | | // convert utf8 to unicode code to get pinyin offset |
468 | 13 | if (auto tmp = (((int)(*from & 0x0F)) << 12) | (((int)(*(from + 1) & 0x3F)) << 6) | |
469 | 13 | (*(from + 2) & 0x3F); |
470 | 13 | tmp >= START_UNICODE_OFFSET and tmp < END_UNICODE_OFFSET) { |
471 | 13 | const char* buf = nullptr; |
472 | 13 | if (tmp >= START_UNICODE_OFFSET && tmp < MID_UNICODE_OFFSET) { |
473 | 2 | buf = PINYIN_DICT1 + (tmp - START_UNICODE_OFFSET) * MAX_PINYIN_LEN; |
474 | 11 | } else if (tmp >= MID_UNICODE_OFFSET && tmp < END_UNICODE_OFFSET) { |
475 | 11 | buf = PINYIN_DICT2 + (tmp - MID_UNICODE_OFFSET) * MAX_PINYIN_LEN; |
476 | 11 | } |
477 | | |
478 | 13 | auto end = strchr(buf, ' '); |
479 | | // max len for pinyin is 6 |
480 | 13 | int len = MAX_PINYIN_LEN; |
481 | 13 | if (end != nullptr && end - buf < MAX_PINYIN_LEN) { |
482 | 3 | len = end - buf; |
483 | 3 | } |
484 | | // set first char '~' just make sure all english word lower than chinese word |
485 | 13 | *dest = 126; |
486 | 13 | memcpy(dest + 1, buf, len); |
487 | 13 | dest += (len + 1); |
488 | 13 | from += 3; |
489 | 13 | } else { |
490 | 0 | do_memcpy(dest, from, 3); |
491 | 0 | } |
492 | 13 | } |
493 | 238 | } |
494 | | |
495 | 35 | *out_len = dest - out; |
496 | 35 | } |
497 | | }; |
498 | | // +-----------------------------------+ |
499 | | // | 丝 | |
500 | | // +-----------------------------------+ |
501 | | // 1 row in set, 1 warning (0.00 sec) |
502 | | // mysql> select char(14989469 using utf8); |
503 | | // +---------------------------+ |
504 | | // | char(14989469 using utf8) | |
505 | | // +---------------------------+ |
506 | | // | 丝 | |
507 | | // +---------------------------+ |
508 | | // 1 row in set, 1 warning (0.00 sec) |
509 | | // mysql> select char(0xe5, 0xa4, 0x9a, 0xe7, 0x9d, 0xbf, 0xe4, 0xb8, 0x9d, 68, 111, 114, 105, 115 using utf8); |
510 | | // +---------------------------------------------------------------------------------------------+ |
511 | | // | char(0xe5, 0xa4, 0x9a, 0xe7, 0x9d, 0xbf, 0xe4, 0xb8, 0x9d, 68, 111, 114, 105, 115 using utf8) | |
512 | | // +---------------------------------------------------------------------------------------------+ |
513 | | // | 多睿丝 Doris | |
514 | | // +---------------------------------------------------------------------------------------------+ |
515 | | // mysql> select char(68, 111, 114, 0, 105, null, 115 using utf8); |
516 | | // +--------------------------------------------------+ |
517 | | // | char(68, 111, 114, 0, 105, null, 115 using utf8) | |
518 | | // +--------------------------------------------------+ |
519 | | // | Dor is | |
520 | | // +--------------------------------------------------+ |
521 | | |
522 | | // return null: |
523 | | // mysql> select char(255 using utf8); |
524 | | // +----------------------+ |
525 | | // | char(255 using utf8) | |
526 | | // +----------------------+ |
527 | | // | NULL | |
528 | | // +----------------------+ |
529 | | // 1 row in set, 2 warnings (0.00 sec) |
530 | | // |
531 | | // mysql> show warnings; |
532 | | // +---------+------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ |
533 | | // | Level | Code | Message | |
534 | | // +---------+------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ |
535 | | // | Warning | 3719 | 'utf8' is currently an alias for the character set UTF8MB3, but will be an alias for UTF8MB4 in a future release. Please consider using UTF8MB4 in order to be unambiguous. | |
536 | | // | Warning | 1300 | Invalid utf8mb3 character string: 'FF' | |
537 | | // +---------+------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ |
538 | | // 2 rows in set (0.01 sec) |
539 | | |
540 | | // max int value: |
541 | | // mysql> select char(18446744073709551615); |
542 | | // +--------------------------------------------------------+ |
543 | | // | char(18446744073709551615) | |
544 | | // +--------------------------------------------------------+ |
545 | | // | 0xFFFFFFFF | |
546 | | // +--------------------------------------------------------+ |
547 | | // 1 row in set (0.00 sec) |
548 | | // |
549 | | // mysql> select char(18446744073709551616); |
550 | | // +--------------------------------------------------------+ |
551 | | // | char(18446744073709551616) | |
552 | | // +--------------------------------------------------------+ |
553 | | // | 0xFFFFFFFF | |
554 | | // +--------------------------------------------------------+ |
555 | | // 1 row in set, 1 warning (0.00 sec) |
556 | | // |
557 | | // mysql> show warnings; |
558 | | // +---------+------+-----------------------------------------------------------+ |
559 | | // | Level | Code | Message | |
560 | | // +---------+------+-----------------------------------------------------------+ |
561 | | // | Warning | 1292 | Truncated incorrect DECIMAL value: '18446744073709551616' | |
562 | | // +---------+------+-----------------------------------------------------------+ |
563 | | // 1 row in set (0.00 sec) |
564 | | |
565 | | // table columns: |
566 | | // mysql> select * from t; |
567 | | // +------+------+------+ |
568 | | // | f1 | f2 | f3 | |
569 | | // +------+------+------+ |
570 | | // | 228 | 184 | 157 | |
571 | | // | 228 | 184 | 0 | |
572 | | // | 228 | 184 | 99 | |
573 | | // | 99 | 228 | 184 | |
574 | | // +------+------+------+ |
575 | | // 4 rows in set (0.00 sec) |
576 | | // |
577 | | // mysql> select char(f1, f2, f3 using utf8) from t; |
578 | | // +-----------------------------+ |
579 | | // | char(f1, f2, f3 using utf8) | |
580 | | // +-----------------------------+ |
581 | | // | 丝 | |
582 | | // | | |
583 | | // | | |
584 | | // | c | |
585 | | // +-----------------------------+ |
586 | | // 4 rows in set, 4 warnings (0.00 sec) |
587 | | // |
588 | | // mysql> show warnings; |
589 | | // +---------+------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ |
590 | | // | Level | Code | Message | |
591 | | // +---------+------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ |
592 | | // | Warning | 3719 | 'utf8' is currently an alias for the character set UTF8MB3, but will be an alias for UTF8MB4 in a future release. Please consider using UTF8MB4 in order to be unambiguous. | |
593 | | // | Warning | 1300 | Invalid utf8mb3 character string: 'E4B800' | |
594 | | // | Warning | 1300 | Invalid utf8mb3 character string: 'E4B863' | |
595 | | // | Warning | 1300 | Invalid utf8mb3 character string: 'E4B8' | |
596 | | // +---------+------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ |
597 | | class FunctionIntToChar : public IFunction { |
598 | | public: |
599 | | static constexpr auto name = "char"; |
600 | 320 | static FunctionPtr create() { return std::make_shared<FunctionIntToChar>(); } |
601 | 0 | String get_name() const override { return name; } |
602 | 0 | size_t get_number_of_arguments() const override { return 0; } |
603 | 312 | bool is_variadic() const override { return true; } |
604 | | |
605 | 311 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
606 | 311 | return make_nullable(std::make_shared<DataTypeString>()); |
607 | 311 | } |
608 | 622 | bool use_default_implementation_for_nulls() const override { return false; } |
609 | | |
610 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
611 | 311 | uint32_t result, size_t input_rows_count) const override { |
612 | 311 | DCHECK_GE(arguments.size(), 2); |
613 | | |
614 | 311 | int argument_size = arguments.size(); |
615 | 311 | std::vector<ColumnPtr> str_columns(argument_size - 1); |
616 | 311 | std::vector<const ColumnString::Offsets*> offsets_list(argument_size - 1); |
617 | 311 | std::vector<const ColumnString::Chars*> chars_list(argument_size - 1); |
618 | | |
619 | | // convert each argument columns to column string and then concat the string columns |
620 | 701 | for (size_t i = 1; i < argument_size; ++i) { |
621 | 390 | if (auto const_column = check_and_get_column<const ColumnConst>( |
622 | 390 | *block.get_by_position(arguments[i]).column)) { |
623 | | // ignore null |
624 | 4 | if (const_column->only_null()) { |
625 | 0 | str_columns[i - 1] = nullptr; |
626 | 4 | } else { |
627 | 4 | auto str_column = ColumnString::create(); |
628 | 4 | auto& chars = str_column->get_chars(); |
629 | 4 | auto& offsets = str_column->get_offsets(); |
630 | 4 | offsets.resize(1); |
631 | 4 | const ColumnInt32* int_column; |
632 | 4 | if (auto* nullable = check_and_get_column<const ColumnNullable>( |
633 | 4 | const_column->get_data_column())) { |
634 | 0 | int_column = assert_cast<const ColumnInt32*>( |
635 | 0 | nullable->get_nested_column_ptr().get()); |
636 | 4 | } else { |
637 | 4 | int_column = |
638 | 4 | assert_cast<const ColumnInt32*>(&const_column->get_data_column()); |
639 | 4 | } |
640 | 4 | int int_val = int_column->get_int(0); |
641 | 4 | integer_to_char_(0, &int_val, chars, offsets); |
642 | 4 | str_columns[i - 1] = |
643 | 4 | ColumnConst::create(std::move(str_column), input_rows_count); |
644 | 4 | } |
645 | 4 | offsets_list[i - 1] = nullptr; |
646 | 4 | chars_list[i - 1] = nullptr; |
647 | 386 | } else { |
648 | 386 | auto str_column = ColumnString::create(); |
649 | 386 | auto& chars = str_column->get_chars(); |
650 | 386 | auto& offsets = str_column->get_offsets(); |
651 | | // data.resize(input_rows_count); |
652 | 386 | offsets.resize(input_rows_count); |
653 | | |
654 | 386 | if (auto nullable = check_and_get_column<const ColumnNullable>( |
655 | 386 | *block.get_by_position(arguments[i]).column)) { |
656 | 23 | const auto* int_data = |
657 | 23 | assert_cast<const ColumnInt32*>(nullable->get_nested_column_ptr().get()) |
658 | 23 | ->get_data() |
659 | 23 | .data(); |
660 | 23 | const auto* null_map_data = nullable->get_null_map_data().data(); |
661 | 148 | for (size_t j = 0; j < input_rows_count; ++j) { |
662 | | // ignore null |
663 | 125 | if (null_map_data[j]) { |
664 | 23 | offsets[j] = offsets[j - 1]; |
665 | 102 | } else { |
666 | 102 | integer_to_char_(j, int_data + j, chars, offsets); |
667 | 102 | } |
668 | 125 | } |
669 | 363 | } else { |
670 | 363 | const auto* int_data = assert_cast<const ColumnInt32*>( |
671 | 363 | block.get_by_position(arguments[i]).column.get()) |
672 | 363 | ->get_data() |
673 | 363 | .data(); |
674 | 770 | for (size_t j = 0; j < input_rows_count; ++j) { |
675 | 407 | integer_to_char_(j, int_data + j, chars, offsets); |
676 | 407 | } |
677 | 363 | } |
678 | 386 | offsets_list[i - 1] = &str_column->get_offsets(); |
679 | 386 | chars_list[i - 1] = &str_column->get_chars(); |
680 | 386 | str_columns[i - 1] = std::move(str_column); |
681 | 386 | } |
682 | 390 | } |
683 | | |
684 | 311 | auto null_map = ColumnUInt8::create(input_rows_count, 0); |
685 | 311 | auto res = ColumnString::create(); |
686 | 311 | auto& res_data = res->get_chars(); |
687 | 311 | auto& res_offset = res->get_offsets(); |
688 | | |
689 | 311 | size_t res_reserve_size = 0; |
690 | 701 | for (size_t i = 0; i < argument_size - 1; ++i) { |
691 | 390 | if (!str_columns[i]) { |
692 | 0 | continue; |
693 | 0 | } |
694 | 390 | if (auto const_column = check_and_get_column<const ColumnConst>(*str_columns[i])) { |
695 | 4 | auto str_column = |
696 | 4 | assert_cast<const ColumnString*>(&(const_column->get_data_column())); |
697 | 4 | auto& offsets = str_column->get_offsets(); |
698 | 4 | res_reserve_size += (offsets[0] - offsets[-1]) * input_rows_count; |
699 | 386 | } else { |
700 | 918 | for (size_t j = 0; j < input_rows_count; ++j) { |
701 | 532 | size_t append = (*offsets_list[i])[j] - (*offsets_list[i])[j - 1]; |
702 | | // check whether the output might overflow(unlikely) |
703 | 532 | if (UNLIKELY(UINT_MAX - append < res_reserve_size)) { |
704 | 0 | return Status::BufferAllocFailed( |
705 | 0 | "function char output is too large to allocate"); |
706 | 0 | } |
707 | 532 | res_reserve_size += append; |
708 | 532 | } |
709 | 386 | } |
710 | 390 | } |
711 | 311 | if ((UNLIKELY(UINT_MAX - input_rows_count < res_reserve_size))) { |
712 | 0 | return Status::BufferAllocFailed("function char output is too large to allocate"); |
713 | 0 | } |
714 | 311 | ColumnString::check_chars_length(res_reserve_size, 0); |
715 | 311 | res_data.resize(res_reserve_size); |
716 | 311 | res_offset.resize(input_rows_count); |
717 | | |
718 | 666 | for (size_t i = 0; i < input_rows_count; ++i) { |
719 | 355 | int current_length = 0; |
720 | 915 | for (size_t j = 0; j < argument_size - 1; ++j) { |
721 | 560 | if (!str_columns[j]) { |
722 | 0 | continue; |
723 | 0 | } |
724 | 560 | if (auto const_column = check_and_get_column<const ColumnConst>(*str_columns[j])) { |
725 | 28 | auto str_column = assert_cast<const ColumnString*, TypeCheckOnRelease::DISABLE>( |
726 | 28 | &(const_column->get_data_column())); |
727 | 28 | auto data_item = str_column->get_data_at(0); |
728 | 28 | memcpy_small_allow_read_write_overflow15( |
729 | 28 | &res_data[res_offset[i - 1]] + current_length, data_item.data, |
730 | 28 | data_item.size); |
731 | 28 | current_length += data_item.size; |
732 | 532 | } else { |
733 | 532 | auto& current_offsets = *offsets_list[j]; |
734 | 532 | auto& current_chars = *chars_list[j]; |
735 | | |
736 | 532 | int size = current_offsets[i] - current_offsets[i - 1]; |
737 | 532 | if (size > 0) { |
738 | 509 | memcpy_small_allow_read_write_overflow15( |
739 | 509 | &res_data[res_offset[i - 1]] + current_length, |
740 | 509 | ¤t_chars[current_offsets[i - 1]], size); |
741 | 509 | current_length += size; |
742 | 509 | } |
743 | 532 | } |
744 | 560 | } |
745 | 355 | res_offset[i] = res_offset[i - 1] + current_length; |
746 | 355 | } |
747 | | |
748 | | // validate utf8 |
749 | 311 | auto* null_map_data = null_map->get_data().data(); |
750 | 666 | for (size_t i = 0; i < input_rows_count; ++i) { |
751 | 355 | if (!validate_utf8((const char*)(&res_data[res_offset[i - 1]]), |
752 | 355 | res_offset[i] - res_offset[i - 1])) { |
753 | 136 | null_map_data[i] = 1; |
754 | 136 | } |
755 | 355 | } |
756 | | |
757 | 311 | block.get_by_position(result).column = |
758 | 311 | ColumnNullable::create(std::move(res), std::move(null_map)); |
759 | 311 | return Status::OK(); |
760 | 311 | } |
761 | | |
762 | | private: |
763 | | void integer_to_char_(int line_num, const int* num, ColumnString::Chars& chars, |
764 | 513 | IColumn::Offsets& offsets) const { |
765 | 513 | if (0 == *num) { |
766 | 26 | chars.push_back('\0'); |
767 | 26 | offsets[line_num] = offsets[line_num - 1] + 1; |
768 | 26 | return; |
769 | 26 | } |
770 | 487 | const char* bytes = (const char*)(num); |
771 | 487 | if constexpr (std::endian::native == std::endian::little) { |
772 | 487 | int k = 3; |
773 | 1.87k | for (; k >= 0; --k) { |
774 | 1.87k | if (bytes[k]) { |
775 | 487 | break; |
776 | 487 | } |
777 | 1.87k | } |
778 | 487 | offsets[line_num] = offsets[line_num - 1] + k + 1; |
779 | 1.05k | for (; k >= 0; --k) { |
780 | 565 | chars.push_back(bytes[k] ? bytes[k] : '\0'); |
781 | 565 | } |
782 | | } else if constexpr (std::endian::native == std::endian::big) { |
783 | | int k = 0; |
784 | | for (; k < 4; ++k) { |
785 | | if (bytes[k]) { |
786 | | break; |
787 | | } |
788 | | } |
789 | | offsets[line_num] = offsets[line_num - 1] + 4 - k; |
790 | | for (; k < 4; ++k) { |
791 | | chars.push_back(bytes[k] ? bytes[k] : '\0'); |
792 | | } |
793 | | } else { |
794 | | static_assert(std::endian::native == std::endian::big || |
795 | | std::endian::native == std::endian::little, |
796 | | "Unsupported endianness"); |
797 | | } |
798 | 487 | } |
799 | | }; |
800 | | |
801 | | class FunctionNgramSearch : public IFunction { |
802 | | public: |
803 | | static constexpr auto name = "ngram_search"; |
804 | 24 | static FunctionPtr create() { return std::make_shared<FunctionNgramSearch>(); } |
805 | 1 | String get_name() const override { return name; } |
806 | 15 | size_t get_number_of_arguments() const override { return 3; } |
807 | 15 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
808 | 15 | return std::make_shared<DataTypeFloat64>(); |
809 | 15 | } |
810 | | |
811 | | // ngram_search(text,pattern,gram_num) |
812 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
813 | 14 | uint32_t result, size_t input_rows_count) const override { |
814 | 14 | CHECK_EQ(arguments.size(), 3); |
815 | 14 | auto col_res = ColumnFloat64::create(); |
816 | 14 | bool col_const[3]; |
817 | 14 | ColumnPtr argument_columns[3]; |
818 | 56 | for (int i = 0; i < 3; ++i) { |
819 | 42 | std::tie(argument_columns[i], col_const[i]) = |
820 | 42 | unpack_if_const(block.get_by_position(arguments[i]).column); |
821 | 42 | } |
822 | 14 | auto pattern = assert_cast<const ColumnString*>(argument_columns[1].get())->get_data_at(0); |
823 | 14 | auto gram_num = assert_cast<const ColumnInt32*>(argument_columns[2].get())->get_element(0); |
824 | 14 | const auto* text_col = assert_cast<const ColumnString*>(argument_columns[0].get()); |
825 | | |
826 | 14 | if (col_const[0]) { |
827 | 0 | _execute_impl<true>(text_col, pattern, gram_num, *col_res, input_rows_count); |
828 | 14 | } else { |
829 | 14 | _execute_impl<false>(text_col, pattern, gram_num, *col_res, input_rows_count); |
830 | 14 | } |
831 | | |
832 | 14 | block.replace_by_position(result, std::move(col_res)); |
833 | 14 | return Status::OK(); |
834 | 14 | } |
835 | | |
836 | | private: |
837 | | using NgramMap = phmap::flat_hash_map<uint32_t, uint8_t>; |
838 | | constexpr static auto not_found = 0b00; |
839 | | constexpr static auto found_in_pattern = 0b01; |
840 | | constexpr static auto found_in_text = 0b10; |
841 | | constexpr static auto found_in_pattern_and_text = 0b11; |
842 | | |
843 | 173 | uint32_t sub_str_hash(const char* data, int32_t length) const { |
844 | 173 | constexpr static uint32_t seed = 0; |
845 | 173 | return crc32c::Extend(seed, (const uint8_t*)data, length); |
846 | 173 | } |
847 | | |
848 | | template <bool column_const> |
849 | | void _execute_impl(const ColumnString* text_col, StringRef& pattern, int gram_num, |
850 | 14 | ColumnFloat64& res, size_t size) const { |
851 | 14 | auto& res_data = res.get_data(); |
852 | 14 | res_data.resize_fill(size, 0); |
853 | | // If the length of the pattern is less than gram_num, return 0. |
854 | 14 | if (pattern.size < gram_num) { |
855 | 0 | return; |
856 | 0 | } |
857 | | |
858 | | // Build a map by pattern string, which will be used repeatedly in the following loop. |
859 | 14 | NgramMap pattern_map; |
860 | 14 | int pattern_count = get_pattern_set(pattern_map, pattern, gram_num); |
861 | | // Each time a loop is executed, the map will be modified, so it needs to be restored afterward. |
862 | 14 | std::vector<uint32_t> restore_map; |
863 | | |
864 | 35 | for (int i = 0; i < size; i++) { |
865 | 21 | auto text = text_col->get_data_at(index_check_const<column_const>(i)); |
866 | 21 | if (text.size < gram_num) { |
867 | | // If the length of the text is less than gram_num, return 0. |
868 | 4 | continue; |
869 | 4 | } |
870 | 17 | restore_map.reserve(text.size); |
871 | 17 | auto [text_count, intersection_count] = |
872 | 17 | get_text_set(text, gram_num, pattern_map, restore_map); |
873 | | |
874 | | // 2 * |Intersection| / (|text substr set| + |pattern substr set|) |
875 | 17 | res_data[i] = 2.0 * intersection_count / (text_count + pattern_count); |
876 | 17 | } |
877 | 14 | } Unexecuted instantiation: _ZNK5doris19FunctionNgramSearch13_execute_implILb1EEEvPKNS_9ColumnStrIjEERNS_9StringRefEiRNS_12ColumnVectorILNS_13PrimitiveTypeE9EEEm _ZNK5doris19FunctionNgramSearch13_execute_implILb0EEEvPKNS_9ColumnStrIjEERNS_9StringRefEiRNS_12ColumnVectorILNS_13PrimitiveTypeE9EEEm Line | Count | Source | 850 | 14 | ColumnFloat64& res, size_t size) const { | 851 | 14 | auto& res_data = res.get_data(); | 852 | 14 | res_data.resize_fill(size, 0); | 853 | | // If the length of the pattern is less than gram_num, return 0. | 854 | 14 | if (pattern.size < gram_num) { | 855 | 0 | return; | 856 | 0 | } | 857 | | | 858 | | // Build a map by pattern string, which will be used repeatedly in the following loop. | 859 | 14 | NgramMap pattern_map; | 860 | 14 | int pattern_count = get_pattern_set(pattern_map, pattern, gram_num); | 861 | | // Each time a loop is executed, the map will be modified, so it needs to be restored afterward. | 862 | 14 | std::vector<uint32_t> restore_map; | 863 | | | 864 | 35 | for (int i = 0; i < size; i++) { | 865 | 21 | auto text = text_col->get_data_at(index_check_const<column_const>(i)); | 866 | 21 | if (text.size < gram_num) { | 867 | | // If the length of the text is less than gram_num, return 0. | 868 | 4 | continue; | 869 | 4 | } | 870 | 17 | restore_map.reserve(text.size); | 871 | 17 | auto [text_count, intersection_count] = | 872 | 17 | get_text_set(text, gram_num, pattern_map, restore_map); | 873 | | | 874 | | // 2 * |Intersection| / (|text substr set| + |pattern substr set|) | 875 | 17 | res_data[i] = 2.0 * intersection_count / (text_count + pattern_count); | 876 | 17 | } | 877 | 14 | } |
|
878 | | |
879 | 14 | size_t get_pattern_set(NgramMap& pattern_map, StringRef& pattern, int gram_num) const { |
880 | 14 | size_t pattern_count = 0; |
881 | 87 | for (int i = 0; i + gram_num <= pattern.size; i++) { |
882 | 73 | uint32_t cur_hash = sub_str_hash(pattern.data + i, gram_num); |
883 | 73 | if (!pattern_map.contains(cur_hash)) { |
884 | 43 | pattern_map[cur_hash] = found_in_pattern; |
885 | 43 | pattern_count++; |
886 | 43 | } |
887 | 73 | } |
888 | 14 | return pattern_count; |
889 | 14 | } |
890 | | |
891 | | std::pair<size_t, size_t> get_text_set(StringRef& text, int gram_num, NgramMap& pattern_map, |
892 | 17 | std::vector<uint32_t>& restore_map) const { |
893 | 17 | restore_map.clear(); |
894 | | //intersection_count indicates a substring both in pattern and text. |
895 | 17 | size_t text_count = 0, intersection_count = 0; |
896 | 117 | for (int i = 0; i + gram_num <= text.size; i++) { |
897 | 100 | uint32_t cur_hash = sub_str_hash(text.data + i, gram_num); |
898 | 100 | auto& val = pattern_map[cur_hash]; |
899 | 100 | if (val == not_found) { |
900 | 26 | val ^= found_in_text; |
901 | 26 | DCHECK(val == found_in_text); |
902 | | // only found in text |
903 | 26 | text_count++; |
904 | 26 | restore_map.push_back(cur_hash); |
905 | 74 | } else if (val == found_in_pattern) { |
906 | 39 | val ^= found_in_text; |
907 | 39 | DCHECK(val == found_in_pattern_and_text); |
908 | | // found in text and pattern |
909 | 39 | text_count++; |
910 | 39 | intersection_count++; |
911 | 39 | restore_map.push_back(cur_hash); |
912 | 39 | } |
913 | 100 | } |
914 | | // Restore the pattern_map. |
915 | 65 | for (auto& restore_hash : restore_map) { |
916 | 65 | pattern_map[restore_hash] ^= found_in_text; |
917 | 65 | } |
918 | | |
919 | 17 | return {text_count, intersection_count}; |
920 | 17 | } |
921 | | }; |
922 | | |
923 | | class FunctionTranslate : public IFunction { |
924 | | public: |
925 | | static constexpr auto name = "translate"; |
926 | | using AsciiMap = std::array<UInt8, 128>; |
927 | | constexpr static UInt8 DELETE_CHAR = 255; // 255 means delete this char |
928 | 100 | static FunctionPtr create() { return std::make_shared<FunctionTranslate>(); } |
929 | 1 | String get_name() const override { return name; } |
930 | 91 | size_t get_number_of_arguments() const override { return 3; } |
931 | | |
932 | 91 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
933 | 91 | return std::make_shared<DataTypeString>(); |
934 | 91 | }; |
935 | | |
936 | 8 | DataTypes get_variadic_argument_types_impl() const override { |
937 | 8 | return {std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>(), |
938 | 8 | std::make_shared<DataTypeString>()}; |
939 | 8 | } |
940 | | |
941 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
942 | 175 | uint32_t result, size_t input_rows_count) const override { |
943 | 175 | CHECK_EQ(arguments.size(), 3); |
944 | 175 | auto col_res = ColumnString::create(); |
945 | 175 | bool col_const[3]; |
946 | 175 | ColumnPtr argument_columns[3]; |
947 | 700 | for (int i = 0; i < 3; ++i) { |
948 | 525 | col_const[i] = is_column_const(*block.get_by_position(arguments[i]).column); |
949 | 525 | } |
950 | 175 | argument_columns[0] = col_const[0] ? static_cast<const ColumnConst&>( |
951 | 20 | *block.get_by_position(arguments[0]).column) |
952 | 20 | .convert_to_full_column() |
953 | 175 | : block.get_by_position(arguments[0]).column; |
954 | 175 | default_preprocess_parameter_columns(argument_columns, col_const, {1, 2}, block, arguments); |
955 | | |
956 | 175 | const auto* col_source = assert_cast<const ColumnString*>(argument_columns[0].get()); |
957 | 175 | const auto* col_from = assert_cast<const ColumnString*>(argument_columns[1].get()); |
958 | 175 | const auto* col_to = assert_cast<const ColumnString*>(argument_columns[2].get()); |
959 | | |
960 | 175 | bool is_ascii = col_source->is_ascii() && col_from->is_ascii() && col_to->is_ascii(); |
961 | 175 | auto impl_vectors = impl_vectors_utf8<false>; |
962 | 175 | if (col_const[1] && col_const[2] && is_ascii) { |
963 | 34 | impl_vectors = impl_vectors_ascii<true>; |
964 | 141 | } else if (col_const[1] && col_const[2]) { |
965 | 1 | impl_vectors = impl_vectors_utf8<true>; |
966 | 140 | } else if (is_ascii) { |
967 | 88 | impl_vectors = impl_vectors_ascii<false>; |
968 | 88 | } |
969 | 175 | impl_vectors(col_source, col_from, col_to, col_res.get()); |
970 | 175 | block.get_by_position(result).column = std::move(col_res); |
971 | 175 | return Status::OK(); |
972 | 175 | } |
973 | | |
974 | | private: |
975 | | template <bool IsConst> |
976 | | static void impl_vectors_ascii(const ColumnString* col_source, const ColumnString* col_from, |
977 | 122 | const ColumnString* col_to, ColumnString* col_res) { |
978 | 122 | auto& res_chars = col_res->get_chars(); |
979 | 122 | auto& res_offsets = col_res->get_offsets(); |
980 | 122 | res_chars.reserve(col_source->get_chars().size()); |
981 | 122 | res_offsets.reserve(col_source->get_offsets().size()); |
982 | 122 | DCHECK_EQ(col_res->size(), 0); |
983 | 122 | AsciiMap map; |
984 | 122 | if (IsConst) { |
985 | 34 | const auto& from_str = col_from->get_data_at(0); |
986 | 34 | const auto& to_str = col_to->get_data_at(0); |
987 | 34 | if (!build_translate_map_ascii(map, from_str, to_str)) { |
988 | | // if the map is not need delete char, we can directly copy the source string,then use map to translate |
989 | 24 | res_offsets.insert(col_source->get_offsets().begin(), |
990 | 24 | col_source->get_offsets().end()); |
991 | 24 | res_chars.insert(col_source->get_chars().begin(), col_source->get_chars().end()); |
992 | 214 | for (int i = 0; i < res_chars.size(); ++i) { |
993 | 190 | res_chars[i] = map[res_chars[i]]; // translate the chars |
994 | 190 | } |
995 | 24 | return; // no need to translate |
996 | 24 | } |
997 | 34 | } |
998 | | |
999 | 98 | auto res_size = 0; |
1000 | 98 | auto* begin_data = col_res->get_chars().data(); |
1001 | 216 | for (size_t i = 0; i < col_source->size(); ++i) { |
1002 | 118 | const auto& source_str = col_source->get_data_at(i); |
1003 | 118 | if (!IsConst) { |
1004 | 104 | const auto& from_str = col_from->get_data_at(i); |
1005 | 104 | const auto& to_str = col_to->get_data_at(i); |
1006 | 104 | build_translate_map_ascii(map, from_str, to_str); |
1007 | 104 | } |
1008 | 118 | auto* dst_data = begin_data + res_size; |
1009 | 118 | res_size += translate_ascii(source_str, map, dst_data); |
1010 | | |
1011 | 118 | res_offsets.push_back(res_size); |
1012 | 118 | } |
1013 | 98 | DCHECK_GE(res_chars.capacity(), res_size); |
1014 | 98 | res_chars.resize(res_size); |
1015 | 98 | } _ZN5doris17FunctionTranslate18impl_vectors_asciiILb1EEEvPKNS_9ColumnStrIjEES5_S5_PS3_ Line | Count | Source | 977 | 34 | const ColumnString* col_to, ColumnString* col_res) { | 978 | 34 | auto& res_chars = col_res->get_chars(); | 979 | 34 | auto& res_offsets = col_res->get_offsets(); | 980 | 34 | res_chars.reserve(col_source->get_chars().size()); | 981 | 34 | res_offsets.reserve(col_source->get_offsets().size()); | 982 | 34 | DCHECK_EQ(col_res->size(), 0); | 983 | 34 | AsciiMap map; | 984 | 34 | if (IsConst) { | 985 | 34 | const auto& from_str = col_from->get_data_at(0); | 986 | 34 | const auto& to_str = col_to->get_data_at(0); | 987 | 34 | if (!build_translate_map_ascii(map, from_str, to_str)) { | 988 | | // if the map is not need delete char, we can directly copy the source string,then use map to translate | 989 | 24 | res_offsets.insert(col_source->get_offsets().begin(), | 990 | 24 | col_source->get_offsets().end()); | 991 | 24 | res_chars.insert(col_source->get_chars().begin(), col_source->get_chars().end()); | 992 | 214 | for (int i = 0; i < res_chars.size(); ++i) { | 993 | 190 | res_chars[i] = map[res_chars[i]]; // translate the chars | 994 | 190 | } | 995 | 24 | return; // no need to translate | 996 | 24 | } | 997 | 34 | } | 998 | | | 999 | 10 | auto res_size = 0; | 1000 | 10 | auto* begin_data = col_res->get_chars().data(); | 1001 | 24 | for (size_t i = 0; i < col_source->size(); ++i) { | 1002 | 14 | const auto& source_str = col_source->get_data_at(i); | 1003 | 14 | if (!IsConst) { | 1004 | 0 | const auto& from_str = col_from->get_data_at(i); | 1005 | 0 | const auto& to_str = col_to->get_data_at(i); | 1006 | 0 | build_translate_map_ascii(map, from_str, to_str); | 1007 | 0 | } | 1008 | 14 | auto* dst_data = begin_data + res_size; | 1009 | 14 | res_size += translate_ascii(source_str, map, dst_data); | 1010 | | | 1011 | 14 | res_offsets.push_back(res_size); | 1012 | 14 | } | 1013 | | DCHECK_GE(res_chars.capacity(), res_size); | 1014 | 10 | 
res_chars.resize(res_size); | 1015 | 10 | } |
_ZN5doris17FunctionTranslate18impl_vectors_asciiILb0EEEvPKNS_9ColumnStrIjEES5_S5_PS3_ Line | Count | Source | 977 | 88 | const ColumnString* col_to, ColumnString* col_res) { | 978 | 88 | auto& res_chars = col_res->get_chars(); | 979 | 88 | auto& res_offsets = col_res->get_offsets(); | 980 | 88 | res_chars.reserve(col_source->get_chars().size()); | 981 | 88 | res_offsets.reserve(col_source->get_offsets().size()); | 982 | 88 | DCHECK_EQ(col_res->size(), 0); | 983 | 88 | AsciiMap map; | 984 | 88 | if (IsConst) { | 985 | 0 | const auto& from_str = col_from->get_data_at(0); | 986 | 0 | const auto& to_str = col_to->get_data_at(0); | 987 | 0 | if (!build_translate_map_ascii(map, from_str, to_str)) { | 988 | | // if the map is not need delete char, we can directly copy the source string,then use map to translate | 989 | 0 | res_offsets.insert(col_source->get_offsets().begin(), | 990 | 0 | col_source->get_offsets().end()); | 991 | 0 | res_chars.insert(col_source->get_chars().begin(), col_source->get_chars().end()); | 992 | 0 | for (int i = 0; i < res_chars.size(); ++i) { | 993 | 0 | res_chars[i] = map[res_chars[i]]; // translate the chars | 994 | 0 | } | 995 | 0 | return; // no need to translate | 996 | 0 | } | 997 | 0 | } | 998 | | | 999 | 88 | auto res_size = 0; | 1000 | 88 | auto* begin_data = col_res->get_chars().data(); | 1001 | 192 | for (size_t i = 0; i < col_source->size(); ++i) { | 1002 | 104 | const auto& source_str = col_source->get_data_at(i); | 1003 | 104 | if (!IsConst) { | 1004 | 104 | const auto& from_str = col_from->get_data_at(i); | 1005 | 104 | const auto& to_str = col_to->get_data_at(i); | 1006 | 104 | build_translate_map_ascii(map, from_str, to_str); | 1007 | 104 | } | 1008 | 104 | auto* dst_data = begin_data + res_size; | 1009 | 104 | res_size += translate_ascii(source_str, map, dst_data); | 1010 | | | 1011 | 104 | res_offsets.push_back(res_size); | 1012 | 104 | } | 1013 | | DCHECK_GE(res_chars.capacity(), res_size); | 1014 | 88 | 
res_chars.resize(res_size); | 1015 | 88 | } |
|
1016 | | |
1017 | | // return true if no need delete char |
1018 | | bool static build_translate_map_ascii(AsciiMap& map, const StringRef& from_str, |
1019 | 138 | const StringRef& to_str) { |
1020 | 17.8k | for (size_t i = 0; i < map.size(); ++i) { |
1021 | 17.6k | map[i] = i; // initialize map to identity |
1022 | 17.6k | } |
1023 | 138 | std::array<UInt8, 128> set_map {0}; |
1024 | 138 | const auto min_size = std::min(from_str.size, to_str.size); |
1025 | | // all ascii characters are in the range [0, 127] |
1026 | 476 | for (size_t i = 0; i < min_size; ++i) { |
1027 | 338 | auto from_char = from_str.data[i]; |
1028 | 338 | auto to_char = to_str.data[i]; |
1029 | 338 | if (set_map[from_char] == 0) { |
1030 | 243 | set_map[from_char] = 1; |
1031 | 243 | map[from_char] = to_char; |
1032 | 243 | } |
1033 | 338 | } |
1034 | | |
1035 | 138 | bool need_delete_char = false; |
1036 | | |
1037 | 207 | for (size_t i = min_size; i < from_str.size; ++i) { |
1038 | 69 | auto from_char = from_str.data[i]; |
1039 | 69 | if (set_map[from_char] == 0) { |
1040 | 57 | set_map[from_char] = 1; |
1041 | 57 | map[from_char] = DELETE_CHAR; // delete this char |
1042 | 57 | need_delete_char = true; |
1043 | 57 | } |
1044 | 69 | } |
1045 | 138 | return need_delete_char; |
1046 | 138 | } |
1047 | | |
1048 | 118 | static size_t translate_ascii(const StringRef& source_str, AsciiMap& map, UInt8* dst_data) { |
1049 | 118 | auto* begin_data = dst_data; |
1050 | 640 | for (size_t i = 0; i < source_str.size; ++i) { |
1051 | 522 | auto c = source_str.data[i]; |
1052 | 522 | if (map[c] == DELETE_CHAR) { |
1053 | 35 | continue; // delete this char |
1054 | 35 | } |
1055 | 487 | *dst_data++ = map[c]; |
1056 | 487 | } |
1057 | 118 | return dst_data - begin_data; |
1058 | 118 | } |
1059 | | |
1060 | | template <bool IsConst> |
1061 | | static void impl_vectors_utf8(const ColumnString* col_source, const ColumnString* col_from, |
1062 | 53 | const ColumnString* col_to, ColumnString* col_res) { |
1063 | 53 | col_res->get_chars().reserve(col_source->get_chars().size()); |
1064 | 53 | col_res->get_offsets().reserve(col_source->get_offsets().size()); |
1065 | 53 | std::unordered_map<std::string_view, std::string_view> translate_map; |
1066 | 53 | if (IsConst) { |
1067 | 1 | const auto& from_str = col_from->get_data_at(0); |
1068 | 1 | const auto& to_str = col_to->get_data_at(0); |
1069 | 1 | translate_map = |
1070 | 1 | build_translate_map_utf8(from_str.to_string_view(), to_str.to_string_view()); |
1071 | 1 | } |
1072 | 344 | for (size_t i = 0; i < col_source->size(); ++i) { |
1073 | 291 | const auto& source_str = col_source->get_data_at(i); |
1074 | 291 | if (!IsConst) { |
1075 | 290 | const auto& from_str = col_from->get_data_at(i); |
1076 | 290 | const auto& to_str = col_to->get_data_at(i); |
1077 | 290 | translate_map = build_translate_map_utf8(from_str.to_string_view(), |
1078 | 290 | to_str.to_string_view()); |
1079 | 290 | } |
1080 | 291 | auto translated_str = translate_utf8(source_str.to_string_view(), translate_map); |
1081 | 291 | col_res->insert_data(translated_str.data(), translated_str.size()); |
1082 | 291 | } |
1083 | 53 | } _ZN5doris17FunctionTranslate17impl_vectors_utf8ILb0EEEvPKNS_9ColumnStrIjEES5_S5_PS3_ Line | Count | Source | 1062 | 52 | const ColumnString* col_to, ColumnString* col_res) { | 1063 | 52 | col_res->get_chars().reserve(col_source->get_chars().size()); | 1064 | 52 | col_res->get_offsets().reserve(col_source->get_offsets().size()); | 1065 | 52 | std::unordered_map<std::string_view, std::string_view> translate_map; | 1066 | 52 | if (IsConst) { | 1067 | 0 | const auto& from_str = col_from->get_data_at(0); | 1068 | 0 | const auto& to_str = col_to->get_data_at(0); | 1069 | 0 | translate_map = | 1070 | 0 | build_translate_map_utf8(from_str.to_string_view(), to_str.to_string_view()); | 1071 | 0 | } | 1072 | 342 | for (size_t i = 0; i < col_source->size(); ++i) { | 1073 | 290 | const auto& source_str = col_source->get_data_at(i); | 1074 | 290 | if (!IsConst) { | 1075 | 290 | const auto& from_str = col_from->get_data_at(i); | 1076 | 290 | const auto& to_str = col_to->get_data_at(i); | 1077 | 290 | translate_map = build_translate_map_utf8(from_str.to_string_view(), | 1078 | 290 | to_str.to_string_view()); | 1079 | 290 | } | 1080 | 290 | auto translated_str = translate_utf8(source_str.to_string_view(), translate_map); | 1081 | 290 | col_res->insert_data(translated_str.data(), translated_str.size()); | 1082 | 290 | } | 1083 | 52 | } |
_ZN5doris17FunctionTranslate17impl_vectors_utf8ILb1EEEvPKNS_9ColumnStrIjEES5_S5_PS3_ Line | Count | Source | 1062 | 1 | const ColumnString* col_to, ColumnString* col_res) { | 1063 | 1 | col_res->get_chars().reserve(col_source->get_chars().size()); | 1064 | 1 | col_res->get_offsets().reserve(col_source->get_offsets().size()); | 1065 | 1 | std::unordered_map<std::string_view, std::string_view> translate_map; | 1066 | 1 | if (IsConst) { | 1067 | 1 | const auto& from_str = col_from->get_data_at(0); | 1068 | 1 | const auto& to_str = col_to->get_data_at(0); | 1069 | 1 | translate_map = | 1070 | 1 | build_translate_map_utf8(from_str.to_string_view(), to_str.to_string_view()); | 1071 | 1 | } | 1072 | 2 | for (size_t i = 0; i < col_source->size(); ++i) { | 1073 | 1 | const auto& source_str = col_source->get_data_at(i); | 1074 | 1 | if (!IsConst) { | 1075 | 0 | const auto& from_str = col_from->get_data_at(i); | 1076 | 0 | const auto& to_str = col_to->get_data_at(i); | 1077 | 0 | translate_map = build_translate_map_utf8(from_str.to_string_view(), | 1078 | 0 | to_str.to_string_view()); | 1079 | 0 | } | 1080 | 1 | auto translated_str = translate_utf8(source_str.to_string_view(), translate_map); | 1081 | 1 | col_res->insert_data(translated_str.data(), translated_str.size()); | 1082 | 1 | } | 1083 | 1 | } |
|
1084 | | |
1085 | | static std::unordered_map<std::string_view, std::string_view> build_translate_map_utf8( |
1086 | 291 | const std::string_view& from_str, const std::string_view& to_str) { |
1087 | 291 | std::unordered_map<std::string_view, std::string_view> translate_map; |
1088 | 1.77k | for (size_t i = 0, from_char_size = 0, j = 0, to_char_size = 0; i < from_str.size(); |
1089 | 1.48k | i += from_char_size, j += to_char_size) { |
1090 | 1.48k | from_char_size = get_utf8_byte_length(from_str[i]); |
1091 | 1.48k | to_char_size = j < to_str.size() ? get_utf8_byte_length(to_str[j]) : 0; |
1092 | 1.48k | auto from_char = from_str.substr(i, from_char_size); |
1093 | 1.48k | if (translate_map.find(from_char) == translate_map.end()) { |
1094 | 799 | translate_map[from_char] = |
1095 | 799 | j < to_str.size() ? to_str.substr(j, to_char_size) : std::string_view(); |
1096 | 799 | } |
1097 | 1.48k | } |
1098 | 291 | return translate_map; |
1099 | 291 | } |
1100 | | |
1101 | | static std::string translate_utf8( |
1102 | | const std::string_view& source_str, |
1103 | 291 | std::unordered_map<std::string_view, std::string_view>& translate_map) { |
1104 | 291 | std::string result; |
1105 | 291 | result.reserve(source_str.size()); |
1106 | 1.71k | for (size_t i = 0, char_size = 0; i < source_str.size(); i += char_size) { |
1107 | 1.42k | char_size = get_utf8_byte_length(source_str[i]); |
1108 | 1.42k | auto c = source_str.substr(i, char_size); |
1109 | 1.42k | if (translate_map.find(c) != translate_map.end()) { |
1110 | 255 | if (!translate_map[c].empty()) { |
1111 | 159 | result.append(translate_map[c]); |
1112 | 159 | } |
1113 | 1.17k | } else { |
1114 | 1.17k | result.append(c); |
1115 | 1.17k | } |
1116 | 1.42k | } |
1117 | 291 | return result; |
1118 | 291 | } |
1119 | | }; |
1120 | | |
1121 | | /// xpath_string(xml, xpath) -> String |
1122 | | /// Returns the text content of the first node that matches the XPath expression. |
1123 | | /// Returns NULL if either xml or xpath is NULL. |
1124 | | /// Returns empty string if the XPath expression matches no nodes. |
1125 | | /// The text content includes the node and all its descendants. |
1126 | | /// Example: |
1127 | | /// xpath_string('<a><b>b1</b><b>b2</b></a>', '/a/b[1]') = 'b1' |
1128 | | /// xpath_string('<a><b>b1</b><b>b2</b></a>', '/a/b[2]') = 'b2' |
1129 | | /// xpath_string('<a><b>b1</b><b>b2</b></a>', '/a/c') = '' |
1130 | | /// xpath_string('invalid xml', '/a/b[1]') = NULL |
1131 | | /// xpath_string(NULL, '/a/b[1]') = NULL |
1132 | | /// xpath_string('<a><b>b1</b><b>b2</b></a>', NULL) = NULL |
1133 | | class FunctionXPathString : public IFunction { |
1134 | | public: |
1135 | | static constexpr auto name = "xpath_string"; |
1136 | 173 | static FunctionPtr create() { return std::make_shared<FunctionXPathString>(); } |
1137 | 1 | String get_name() const override { return name; } |
1138 | 164 | size_t get_number_of_arguments() const override { return 2; } |
1139 | 164 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
1140 | 164 | return make_nullable(std::make_shared<DataTypeString>()); |
1141 | 164 | } |
1142 | | |
1143 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
1144 | 246 | uint32_t result, size_t input_rows_count) const override { |
1145 | 246 | CHECK_EQ(arguments.size(), 2); |
1146 | 246 | auto col_res = ColumnNullable::create(ColumnString::create(), ColumnUInt8::create()); |
1147 | 246 | const auto& [left_col, left_const] = |
1148 | 246 | unpack_if_const(block.get_by_position(arguments[0]).column); |
1149 | 246 | const auto& [right_col, right_const] = |
1150 | 246 | unpack_if_const(block.get_by_position(arguments[1]).column); |
1151 | 246 | const auto& xml_col = *assert_cast<const ColumnString*>(left_col.get()); |
1152 | 246 | const auto& xpath_col = *assert_cast<const ColumnString*>(right_col.get()); |
1153 | | |
1154 | 246 | Status status; |
1155 | 246 | if (left_const && right_const) { |
1156 | 0 | status = execute_vector<true, true>(input_rows_count, xml_col, xpath_col, *col_res); |
1157 | 246 | } else if (left_const) { |
1158 | 42 | status = execute_vector<true, false>(input_rows_count, xml_col, xpath_col, *col_res); |
1159 | 204 | } else if (right_const) { |
1160 | 51 | status = execute_vector<false, true>(input_rows_count, xml_col, xpath_col, *col_res); |
1161 | 153 | } else { |
1162 | 153 | status = execute_vector<false, false>(input_rows_count, xml_col, xpath_col, *col_res); |
1163 | 153 | } |
1164 | 246 | if (!status.ok()) { |
1165 | 1 | return status; |
1166 | 1 | } |
1167 | | |
1168 | 245 | block.get_by_position(result).column = std::move(col_res); |
1169 | 245 | return Status::OK(); |
1170 | 246 | } |
1171 | | |
1172 | | private: |
1173 | 331 | static Status parse_xml(const StringRef& xml_str, pugi::xml_document& xml_doc) { |
1174 | 331 | pugi::xml_parse_result result = xml_doc.load_buffer(xml_str.data, xml_str.size); |
1175 | 331 | if (!result) { |
1176 | 1 | return Status::InvalidArgument("Function {} failed to parse XML string: {}", name, |
1177 | 1 | result.description()); |
1178 | 1 | } |
1179 | 330 | return Status::OK(); |
1180 | 331 | } |
1181 | | |
1182 | 340 | static Status build_xpath_query(const StringRef& xpath_str, pugi::xpath_query& xpath_query) { |
1183 | | // xpath_query will throws xpath_exception on compilation errors. |
1184 | 340 | try { |
1185 | | // NOTE!!!: don't use to_string_view(), because xpath_str maybe not null-terminated |
1186 | 340 | xpath_query = pugi::xpath_query(xpath_str.to_string().c_str()); |
1187 | 340 | } catch (const pugi::xpath_exception& e) { |
1188 | 0 | return Status::InvalidArgument("Function {} failed to build XPath query: {}", name, |
1189 | 0 | e.what()); |
1190 | 0 | } |
1191 | 340 | return Status::OK(); |
1192 | 340 | } |
1193 | | |
1194 | | template <bool left_const, bool right_const> |
1195 | | static Status execute_vector(const size_t input_rows_count, const ColumnString& xml_col, |
1196 | 246 | const ColumnString& xpath_col, ColumnNullable& res_col) { |
1197 | 246 | pugi::xml_document xml_doc; |
1198 | 246 | pugi::xpath_query xpath_query; |
1199 | | // first check right_const, because we want to check empty input first |
1200 | 246 | if constexpr (right_const) { |
1201 | 51 | auto xpath_str = xpath_col.get_data_at(0); |
1202 | 51 | if (xpath_str.empty()) { |
1203 | | // should return null if xpath_str is empty |
1204 | 1 | res_col.insert_many_defaults(input_rows_count); |
1205 | 1 | return Status::OK(); |
1206 | 1 | } |
1207 | 50 | RETURN_IF_ERROR(build_xpath_query(xpath_str, xpath_query)); |
1208 | 50 | } |
1209 | 50 | if constexpr (left_const) { |
1210 | 42 | auto xml_str = xml_col.get_data_at(0); |
1211 | 42 | if (xml_str.empty()) { |
1212 | | // should return null if xml_str is empty |
1213 | 1 | res_col.insert_many_defaults(input_rows_count); |
1214 | 1 | return Status::OK(); |
1215 | 1 | } |
1216 | 41 | RETURN_IF_ERROR(parse_xml(xml_str, xml_doc)); |
1217 | 41 | } |
1218 | | |
1219 | 633 | for (size_t i = 0; i < input_rows_count; ++i) { |
1220 | 388 | if constexpr (!right_const) { |
1221 | 308 | auto xpath_str = xpath_col.get_data_at(i); |
1222 | 308 | if (xpath_str.empty()) { |
1223 | | // should return null if xpath_str is empty |
1224 | 18 | res_col.insert_default(); |
1225 | 18 | continue; |
1226 | 18 | } |
1227 | 290 | RETURN_IF_ERROR(build_xpath_query(xpath_str, xpath_query)); |
1228 | 290 | } |
1229 | 327 | if constexpr (!left_const) { |
1230 | 327 | auto xml_str = xml_col.get_data_at(i); |
1231 | 327 | if (xml_str.empty()) { |
1232 | | // should return null if xml_str is empty |
1233 | 20 | res_col.insert_default(); |
1234 | 20 | continue; |
1235 | 20 | } |
1236 | 307 | RETURN_IF_ERROR(parse_xml(xml_str, xml_doc)); |
1237 | 307 | } |
1238 | 306 | std::string text; |
1239 | 388 | try { |
1240 | 388 | text = xpath_query.evaluate_string(xml_doc); |
1241 | 388 | } catch (const pugi::xpath_exception& e) { |
1242 | 0 | return Status::InvalidArgument("Function {} failed to query XPath string: {}", name, |
1243 | 0 | e.what()); |
1244 | 0 | } |
1245 | 349 | res_col.insert_data(text.data(), text.size()); |
1246 | 349 | } |
1247 | 245 | return Status::OK(); |
1248 | 246 | } Unexecuted instantiation: _ZN5doris19FunctionXPathString14execute_vectorILb1ELb1EEENS_6StatusEmRKNS_9ColumnStrIjEES6_RNS_14ColumnNullableE _ZN5doris19FunctionXPathString14execute_vectorILb1ELb0EEENS_6StatusEmRKNS_9ColumnStrIjEES6_RNS_14ColumnNullableE Line | Count | Source | 1196 | 42 | const ColumnString& xpath_col, ColumnNullable& res_col) { | 1197 | 42 | pugi::xml_document xml_doc; | 1198 | 42 | pugi::xpath_query xpath_query; | 1199 | | // first check right_const, because we want to check empty input first | 1200 | | if constexpr (right_const) { | 1201 | | auto xpath_str = xpath_col.get_data_at(0); | 1202 | | if (xpath_str.empty()) { | 1203 | | // should return null if xpath_str is empty | 1204 | | res_col.insert_many_defaults(input_rows_count); | 1205 | | return Status::OK(); | 1206 | | } | 1207 | | RETURN_IF_ERROR(build_xpath_query(xpath_str, xpath_query)); | 1208 | | } | 1209 | 42 | if constexpr (left_const) { | 1210 | 42 | auto xml_str = xml_col.get_data_at(0); | 1211 | 42 | if (xml_str.empty()) { | 1212 | | // should return null if xml_str is empty | 1213 | 1 | res_col.insert_many_defaults(input_rows_count); | 1214 | 1 | return Status::OK(); | 1215 | 1 | } | 1216 | 41 | RETURN_IF_ERROR(parse_xml(xml_str, xml_doc)); | 1217 | 41 | } | 1218 | | | 1219 | 103 | for (size_t i = 0; i < input_rows_count; ++i) { | 1220 | 61 | if constexpr (!right_const) { | 1221 | 61 | auto xpath_str = xpath_col.get_data_at(i); | 1222 | 61 | if (xpath_str.empty()) { | 1223 | | // should return null if xpath_str is empty | 1224 | 1 | res_col.insert_default(); | 1225 | 1 | continue; | 1226 | 1 | } | 1227 | 60 | RETURN_IF_ERROR(build_xpath_query(xpath_str, xpath_query)); | 1228 | 60 | } | 1229 | | if constexpr (!left_const) { | 1230 | | auto xml_str = xml_col.get_data_at(i); | 1231 | | if (xml_str.empty()) { | 1232 | | // should return null if xml_str is empty | 1233 | | res_col.insert_default(); | 1234 | | continue; | 1235 | | } | 1236 | | 
RETURN_IF_ERROR(parse_xml(xml_str, xml_doc)); | 1237 | | } | 1238 | 61 | std::string text; | 1239 | 61 | try { | 1240 | 61 | text = xpath_query.evaluate_string(xml_doc); | 1241 | 61 | } catch (const pugi::xpath_exception& e) { | 1242 | 0 | return Status::InvalidArgument("Function {} failed to query XPath string: {}", name, | 1243 | 0 | e.what()); | 1244 | 0 | } | 1245 | 60 | res_col.insert_data(text.data(), text.size()); | 1246 | 60 | } | 1247 | 42 | return Status::OK(); | 1248 | 42 | } |
_ZN5doris19FunctionXPathString14execute_vectorILb0ELb1EEENS_6StatusEmRKNS_9ColumnStrIjEES6_RNS_14ColumnNullableE Line | Count | Source | 1196 | 51 | const ColumnString& xpath_col, ColumnNullable& res_col) { | 1197 | 51 | pugi::xml_document xml_doc; | 1198 | 51 | pugi::xpath_query xpath_query; | 1199 | | // first check right_const, because we want to check empty input first | 1200 | 51 | if constexpr (right_const) { | 1201 | 51 | auto xpath_str = xpath_col.get_data_at(0); | 1202 | 51 | if (xpath_str.empty()) { | 1203 | | // should return null if xpath_str is empty | 1204 | 1 | res_col.insert_many_defaults(input_rows_count); | 1205 | 1 | return Status::OK(); | 1206 | 1 | } | 1207 | 50 | RETURN_IF_ERROR(build_xpath_query(xpath_str, xpath_query)); | 1208 | 50 | } | 1209 | | if constexpr (left_const) { | 1210 | | auto xml_str = xml_col.get_data_at(0); | 1211 | | if (xml_str.empty()) { | 1212 | | // should return null if xml_str is empty | 1213 | | res_col.insert_many_defaults(input_rows_count); | 1214 | | return Status::OK(); | 1215 | | } | 1216 | | RETURN_IF_ERROR(parse_xml(xml_str, xml_doc)); | 1217 | | } | 1218 | | | 1219 | 131 | for (size_t i = 0; i < input_rows_count; ++i) { | 1220 | | if constexpr (!right_const) { | 1221 | | auto xpath_str = xpath_col.get_data_at(i); | 1222 | | if (xpath_str.empty()) { | 1223 | | // should return null if xpath_str is empty | 1224 | | res_col.insert_default(); | 1225 | | continue; | 1226 | | } | 1227 | | RETURN_IF_ERROR(build_xpath_query(xpath_str, xpath_query)); | 1228 | | } | 1229 | 80 | if constexpr (!left_const) { | 1230 | 80 | auto xml_str = xml_col.get_data_at(i); | 1231 | 80 | if (xml_str.empty()) { | 1232 | | // should return null if xml_str is empty | 1233 | 5 | res_col.insert_default(); | 1234 | 5 | continue; | 1235 | 5 | } | 1236 | 75 | RETURN_IF_ERROR(parse_xml(xml_str, xml_doc)); | 1237 | 75 | } | 1238 | 75 | std::string text; | 1239 | 80 | try { | 1240 | 80 | text = xpath_query.evaluate_string(xml_doc); | 1241 | 80 | 
} catch (const pugi::xpath_exception& e) { | 1242 | 0 | return Status::InvalidArgument("Function {} failed to query XPath string: {}", name, | 1243 | 0 | e.what()); | 1244 | 0 | } | 1245 | 75 | res_col.insert_data(text.data(), text.size()); | 1246 | 75 | } | 1247 | 51 | return Status::OK(); | 1248 | 51 | } |
_ZN5doris19FunctionXPathString14execute_vectorILb0ELb0EEENS_6StatusEmRKNS_9ColumnStrIjEES6_RNS_14ColumnNullableE Line | Count | Source | 1196 | 153 | const ColumnString& xpath_col, ColumnNullable& res_col) { | 1197 | 153 | pugi::xml_document xml_doc; | 1198 | 153 | pugi::xpath_query xpath_query; | 1199 | | // first check right_const, because we want to check empty input first | 1200 | | if constexpr (right_const) { | 1201 | | auto xpath_str = xpath_col.get_data_at(0); | 1202 | | if (xpath_str.empty()) { | 1203 | | // should return null if xpath_str is empty | 1204 | | res_col.insert_many_defaults(input_rows_count); | 1205 | | return Status::OK(); | 1206 | | } | 1207 | | RETURN_IF_ERROR(build_xpath_query(xpath_str, xpath_query)); | 1208 | | } | 1209 | | if constexpr (left_const) { | 1210 | | auto xml_str = xml_col.get_data_at(0); | 1211 | | if (xml_str.empty()) { | 1212 | | // should return null if xml_str is empty | 1213 | | res_col.insert_many_defaults(input_rows_count); | 1214 | | return Status::OK(); | 1215 | | } | 1216 | | RETURN_IF_ERROR(parse_xml(xml_str, xml_doc)); | 1217 | | } | 1218 | | | 1219 | 399 | for (size_t i = 0; i < input_rows_count; ++i) { | 1220 | 247 | if constexpr (!right_const) { | 1221 | 247 | auto xpath_str = xpath_col.get_data_at(i); | 1222 | 247 | if (xpath_str.empty()) { | 1223 | | // should return null if xpath_str is empty | 1224 | 17 | res_col.insert_default(); | 1225 | 17 | continue; | 1226 | 17 | } | 1227 | 230 | RETURN_IF_ERROR(build_xpath_query(xpath_str, xpath_query)); | 1228 | 230 | } | 1229 | 247 | if constexpr (!left_const) { | 1230 | 247 | auto xml_str = xml_col.get_data_at(i); | 1231 | 247 | if (xml_str.empty()) { | 1232 | | // should return null if xml_str is empty | 1233 | 15 | res_col.insert_default(); | 1234 | 15 | continue; | 1235 | 15 | } | 1236 | 232 | RETURN_IF_ERROR(parse_xml(xml_str, xml_doc)); | 1237 | 232 | } | 1238 | 231 | std::string text; | 1239 | 247 | try { | 1240 | 247 | text = 
xpath_query.evaluate_string(xml_doc); | 1241 | 247 | } catch (const pugi::xpath_exception& e) { | 1242 | 0 | return Status::InvalidArgument("Function {} failed to query XPath string: {}", name, | 1243 | 0 | e.what()); | 1244 | 0 | } | 1245 | 214 | res_col.insert_data(text.data(), text.size()); | 1246 | 214 | } | 1247 | 152 | return Status::OK(); | 1248 | 153 | } |
|
1249 | | }; |
1250 | | |
1251 | | class MakeSetImpl { |
1252 | | public: |
1253 | | static constexpr auto name = "make_set"; |
1254 | | |
1255 | 0 | static size_t get_number_of_arguments() { return 0; } |
1256 | 36 | static bool is_variadic() { return true; } |
1257 | 35 | static DataTypePtr get_return_type_impl(const DataTypes& arguments) { |
1258 | 35 | if (arguments[0].get()->is_nullable()) { |
1259 | 12 | return make_nullable(std::make_shared<DataTypeString>()); |
1260 | 12 | } |
1261 | 23 | return std::make_shared<DataTypeString>(); |
1262 | 35 | } |
1263 | | |
1264 | | static bool is_return_nullable(bool has_nullable, |
1265 | 35 | const std::vector<ColumnWithConstAndNullMap>& cols_info) { |
1266 | 35 | return cols_info[0].null_map != nullptr; |
1267 | 35 | } |
1268 | | |
1269 | | static bool execute_const_null(ColumnString::MutablePtr& res_col, |
1270 | | PaddedPODArray<UInt8>& res_null_map_data, |
1271 | 2 | size_t input_rows_count, size_t null_index) { |
1272 | 2 | if (null_index == 1) { |
1273 | 0 | res_col->insert_many_defaults(input_rows_count); |
1274 | 0 | res_null_map_data.assign(input_rows_count, (UInt8)1); |
1275 | 0 | return true; |
1276 | 0 | } |
1277 | 2 | return false; |
1278 | 2 | } |
1279 | | |
1280 | | static void execute(const std::vector<ColumnWithConstAndNullMap>& column_infos, |
1281 | | ColumnString::MutablePtr& res_col, PaddedPODArray<UInt8>& res_null_map_data, |
1282 | 35 | size_t input_rows_count) { |
1283 | 35 | static constexpr char SEPARATOR = ','; |
1284 | 35 | const auto& bit_data = |
1285 | 35 | assert_cast<const ColumnInt64&>(*column_infos[0].nested_col).get_data(); |
1286 | 35 | std::vector<const ColumnString*> str_cols(column_infos.size()); |
1287 | 249 | for (size_t i = 1; i < column_infos.size(); ++i) { |
1288 | 214 | str_cols[i] = assert_cast<const ColumnString*>(column_infos[i].nested_col); |
1289 | 214 | } |
1290 | | |
1291 | 200 | for (size_t row = 0; row < input_rows_count; ++row) { |
1292 | 165 | if (column_infos[0].is_null_at(row)) { |
1293 | 10 | res_col->insert_default(); |
1294 | 10 | res_null_map_data[row] = 1; |
1295 | 10 | continue; |
1296 | 10 | } |
1297 | | |
1298 | 155 | uint64_t bit = bit_data[column_infos[0].is_const ? 0 : row]; |
1299 | 155 | uint64_t col_pos = __builtin_ffsll(bit); |
1300 | 155 | ColumnString::Chars data; |
1301 | 452 | while (col_pos != 0 && col_pos < column_infos.size() && bit != 0) { |
1302 | 297 | if (!column_infos[col_pos].is_null_at(row)) { |
1303 | | /* Here insert `str,` directly to support the case below: |
1304 | | * SELECT MAKE_SET(3, '', 'a'); |
1305 | | * the exception result should be ',a'. |
1306 | | */ |
1307 | 259 | auto s_ref = str_cols[col_pos]->get_data_at( |
1308 | 259 | column_infos[col_pos].is_const ? 0 : row); |
1309 | 259 | data.insert(s_ref.data, s_ref.data + s_ref.size); |
1310 | 259 | data.push_back(SEPARATOR); |
1311 | 259 | } |
1312 | 297 | bit &= ~(1ULL << (col_pos - 1)); |
1313 | 297 | col_pos = __builtin_ffsll(bit); |
1314 | 297 | } |
1315 | | // remove the last ',' |
1316 | 155 | if (!data.empty()) { |
1317 | 140 | data.pop_back(); |
1318 | 140 | } |
1319 | 155 | res_col->insert_data(reinterpret_cast<const char*>(data.data()), data.size()); |
1320 | 155 | } |
1321 | 35 | } |
1322 | | }; |
1323 | | |
1324 | | class FunctionExportSet : public IFunction { |
1325 | | public: |
1326 | | static constexpr auto name = "export_set"; |
1327 | 78 | static FunctionPtr create() { return std::make_shared<FunctionExportSet>(); } |
1328 | 0 | String get_name() const override { return name; } |
1329 | 0 | size_t get_number_of_arguments() const override { return 0; } |
1330 | 70 | bool is_variadic() const override { return true; } |
1331 | 69 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
1332 | 69 | return std::make_shared<DataTypeString>(); |
1333 | 69 | } |
1334 | | |
1335 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
1336 | 69 | uint32_t result, size_t input_rows_count) const override { |
1337 | 69 | auto res_col = ColumnString::create(); |
1338 | | |
1339 | 69 | const size_t arg_size = arguments.size(); |
1340 | 69 | bool col_const[5]; |
1341 | 69 | ColumnPtr arg_cols[5]; |
1342 | 69 | bool all_const = true; |
1343 | 244 | for (int i = 1; i < arg_size; ++i) { |
1344 | 175 | col_const[i] = is_column_const(*block.get_by_position(arguments[i]).column); |
1345 | 175 | all_const = all_const && col_const[i]; |
1346 | 175 | } |
1347 | 69 | std::tie(arg_cols[0], col_const[0]) = |
1348 | 69 | unpack_if_const(block.get_by_position(arguments[0]).column); |
1349 | 69 | if (arg_size == 3) { |
1350 | 49 | default_preprocess_parameter_columns(arg_cols, col_const, {1, 2}, block, arguments); |
1351 | 49 | } else if (arg_size == 4) { |
1352 | 3 | default_preprocess_parameter_columns(arg_cols, col_const, {1, 2, 3}, block, arguments); |
1353 | 17 | } else if (arg_size == 5) { |
1354 | 17 | default_preprocess_parameter_columns(arg_cols, col_const, {1, 2, 3, 4}, block, |
1355 | 17 | arguments); |
1356 | 17 | } |
1357 | | |
1358 | 69 | const auto* bit_col = assert_cast<const ColumnInt128*>(arg_cols[0].get()); |
1359 | 69 | const auto* on_col = assert_cast<const ColumnString*>(arg_cols[1].get()); |
1360 | 69 | const auto* off_col = assert_cast<const ColumnString*>(arg_cols[2].get()); |
1361 | 69 | const ColumnString* sep_col = nullptr; |
1362 | 69 | const ColumnInt32* num_bits_col = nullptr; |
1363 | 69 | if (arg_size > 3) { |
1364 | 20 | sep_col = assert_cast<const ColumnString*>(arg_cols[3].get()); |
1365 | 20 | if (arg_size == 5) { |
1366 | 17 | num_bits_col = assert_cast<const ColumnInt32*>(arg_cols[4].get()); |
1367 | 17 | } |
1368 | 20 | } |
1369 | | |
1370 | 246 | for (size_t i = 0; i < input_rows_count; ++i) { |
1371 | 177 | uint64_t bit = |
1372 | 177 | check_and_get_bit(bit_col->get_element(index_check_const(i, col_const[0]))); |
1373 | | |
1374 | 177 | size_t idx_for_args = all_const ? 0 : i; |
1375 | 177 | StringRef on = on_col->get_data_at(idx_for_args); |
1376 | 177 | StringRef off = off_col->get_data_at(idx_for_args); |
1377 | 177 | StringRef separator(",", 1); |
1378 | 177 | int8_t num_of_bits = 64; |
1379 | | |
1380 | 177 | if (arg_size > 3) { |
1381 | 104 | separator = sep_col->get_data_at(idx_for_args); |
1382 | 104 | if (arg_size == 5) { |
1383 | 77 | num_of_bits = |
1384 | 77 | check_and_get_num_of_bits(num_bits_col->get_element(idx_for_args)); |
1385 | 77 | } |
1386 | 104 | } |
1387 | | |
1388 | 177 | execute_single(bit, on, off, separator, num_of_bits, *res_col); |
1389 | 177 | } |
1390 | 69 | block.replace_by_position(result, std::move(res_col)); |
1391 | 69 | return Status::OK(); |
1392 | 69 | } |
1393 | | |
1394 | | private: |
1395 | | /* The valid range of the input `bit` parameter should be [-2^63, 2^64 - 1] |
1396 | | * If it exceeds this range, the MAX/MIN values of the signed 64-bit integer are used for calculation |
1397 | | * This behavior is consistent with MySQL. |
1398 | | */ |
1399 | 177 | uint64_t check_and_get_bit(__int128 col_bit_val) const { |
1400 | 177 | if (col_bit_val > ULLONG_MAX) { |
1401 | 3 | return LLONG_MAX; |
1402 | 174 | } else if (col_bit_val < LLONG_MIN) { |
1403 | 1 | return LLONG_MIN; |
1404 | 1 | } |
1405 | 173 | return static_cast<uint64_t>(col_bit_val); |
1406 | 177 | } |
1407 | | |
1408 | | // If the input value is not in the range [0, 64], return default value 64 |
1409 | 77 | int8_t check_and_get_num_of_bits(int32_t col_num_of_bits_val) const { |
1410 | 77 | if (col_num_of_bits_val >= 0 && col_num_of_bits_val <= 64) { |
1411 | 71 | return static_cast<int8_t>(col_num_of_bits_val); |
1412 | 71 | } |
1413 | 6 | return 64; |
1414 | 77 | } |
1415 | | |
1416 | | void execute_single(uint64_t bit, const StringRef& on, const StringRef& off, |
1417 | | const StringRef& separator, int8_t num_of_bits, |
1418 | 177 | ColumnString& res_col) const { |
1419 | 177 | ColumnString::Chars data; |
1420 | 177 | data.reserve(std::max(on.size, off.size) * num_of_bits + |
1421 | 177 | separator.size * (num_of_bits - 1)); |
1422 | | |
1423 | 5.03k | while (bit && num_of_bits) { |
1424 | 4.86k | if (bit & 1) { |
1425 | 3.04k | data.insert(on.data, on.data + on.size); |
1426 | 3.04k | } else { |
1427 | 1.82k | data.insert(off.data, off.data + off.size); |
1428 | 1.82k | } |
1429 | 4.86k | bit >>= 1; |
1430 | 4.86k | if (--num_of_bits) { |
1431 | 4.79k | data.insert(separator.data, separator.data + separator.size); |
1432 | 4.79k | } |
1433 | 4.86k | } |
1434 | | |
1435 | 177 | if (num_of_bits > 0) { |
1436 | 111 | ColumnString::Chars off_sep_combo; |
1437 | 111 | off_sep_combo.reserve(separator.size + off.size); |
1438 | 111 | off_sep_combo.insert(off_sep_combo.end(), off.data, off.data + off.size); |
1439 | 111 | off_sep_combo.insert(off_sep_combo.end(), separator.data, |
1440 | 111 | separator.data + separator.size); |
1441 | | |
1442 | 3.30k | for (size_t i = 0; i < num_of_bits; ++i) { |
1443 | 3.19k | data.insert(off_sep_combo.data(), off_sep_combo.data() + off_sep_combo.size()); |
1444 | 3.19k | } |
1445 | 111 | data.erase(data.end() - separator.size, data.end()); |
1446 | 111 | } |
1447 | | |
1448 | 177 | res_col.insert_data(reinterpret_cast<const char*>(data.data()), data.size()); |
1449 | 177 | } |
1450 | | }; |
1451 | | |
1452 | | // ATTN: for debug only |
1453 | | // compute crc32 hash value as the same way in `VOlapTablePartitionParam::find_tablets()` |
1454 | | class FunctionCrc32Internal : public IFunction { |
1455 | | public: |
1456 | | static constexpr auto name = "crc32_internal"; |
1457 | 44.1k | static FunctionPtr create() { return std::make_shared<FunctionCrc32Internal>(); } |
1458 | 0 | String get_name() const override { return name; } |
1459 | 0 | size_t get_number_of_arguments() const override { return 0; } |
1460 | 44.0k | bool is_variadic() const override { return true; } |
1461 | 59.2k | bool use_default_implementation_for_nulls() const override { return false; } |
1462 | 44.0k | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
1463 | 44.0k | return std::make_shared<DataTypeInt64>(); |
1464 | 44.0k | } |
1465 | | |
1466 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
1467 | 15.0k | uint32_t result, size_t input_rows_count) const override { |
1468 | 15.0k | DCHECK_GE(arguments.size(), 1); |
1469 | | |
1470 | 15.0k | auto argument_size = arguments.size(); |
1471 | 15.0k | std::vector<ColumnPtr> argument_columns(argument_size); |
1472 | 15.0k | std::vector<PrimitiveType> argument_primitive_types(argument_size); |
1473 | | |
1474 | 30.4k | for (size_t i = 0; i < argument_size; ++i) { |
1475 | 15.3k | argument_columns[i] = |
1476 | 15.3k | block.get_by_position(arguments[i]).column->convert_to_full_column_if_const(); |
1477 | 15.3k | argument_primitive_types[i] = |
1478 | 15.3k | block.get_by_position(arguments[i]).type->get_primitive_type(); |
1479 | 15.3k | } |
1480 | | |
1481 | 15.0k | auto res_col = ColumnInt64::create(); |
1482 | 15.0k | auto& res_data = res_col->get_data(); |
1483 | 15.0k | res_data.resize_fill(input_rows_count, 0); |
1484 | | |
1485 | 14.9M | for (size_t i = 0; i < input_rows_count; ++i) { |
1486 | 14.9M | uint32_t hash_val = 0; |
1487 | 29.8M | for (size_t j = 0; j < argument_size; ++j) { |
1488 | 14.9M | const auto& column = argument_columns[j]; |
1489 | 14.9M | auto primitive_type = argument_primitive_types[j]; |
1490 | 14.9M | auto val = column->get_data_at(i); |
1491 | 14.9M | if (val.data != nullptr) { |
1492 | 14.9M | hash_val = RawValue::zlib_crc32(val.data, val.size, primitive_type, hash_val); |
1493 | 14.9M | } else { |
1494 | 4.96k | hash_val = HashUtil::zlib_crc_hash_null(hash_val); |
1495 | 4.96k | } |
1496 | 14.9M | } |
1497 | 14.9M | res_data[i] = hash_val; |
1498 | 14.9M | } |
1499 | | |
1500 | 15.0k | block.replace_by_position(result, std::move(res_col)); |
1501 | 15.0k | return Status::OK(); |
1502 | 15.0k | } |
1503 | | }; |
1504 | | |
1505 | | class FunctionUnicodeNormalize : public IFunction { |
1506 | | public: |
1507 | | static constexpr auto name = "unicode_normalize"; |
1508 | | |
1509 | 24 | static FunctionPtr create() { return std::make_shared<FunctionUnicodeNormalize>(); } |
1510 | | |
1511 | 5 | String get_name() const override { return name; } |
1512 | | |
1513 | 15 | size_t get_number_of_arguments() const override { return 2; } |
1514 | | |
1515 | 15 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
1516 | 15 | if (arguments.size() != 2 || !is_string_type(arguments[0]->get_primitive_type()) || |
1517 | 15 | !is_string_type(arguments[1]->get_primitive_type())) { |
1518 | 0 | throw doris::Exception(ErrorCode::INVALID_ARGUMENT, |
1519 | 0 | "Illegal type {} and {} of arguments of function {}", |
1520 | 0 | arguments[0]->get_name(), arguments[1]->get_name(), get_name()); |
1521 | 0 | } |
1522 | 15 | return arguments[0]; |
1523 | 15 | } |
1524 | | |
1525 | 16 | ColumnNumbers get_arguments_that_are_always_constant() const override { return {1}; } |
1526 | | |
1527 | 32 | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override { |
1528 | 32 | if (scope == FunctionContext::THREAD_LOCAL) { |
1529 | 17 | return Status::OK(); |
1530 | 17 | } |
1531 | | |
1532 | 15 | if (!context->is_col_constant(1)) { |
1533 | 2 | return Status::InvalidArgument( |
1534 | 2 | "The second argument 'mode' of function {} must be constant", get_name()); |
1535 | 2 | } |
1536 | | |
1537 | 13 | auto* const_col = context->get_constant_col(1); |
1538 | 13 | auto mode_ref = const_col->column_ptr->get_data_at(0); |
1539 | 13 | std::string lower_mode = doris::to_lower(std::string(doris::trim(mode_ref.to_string()))); |
1540 | | |
1541 | 13 | UErrorCode status = U_ZERO_ERROR; |
1542 | 13 | const icu::Normalizer2* normalizer = nullptr; |
1543 | | |
1544 | 13 | if (lower_mode == "nfc") { |
1545 | 5 | normalizer = icu::Normalizer2::getInstance(nullptr, "nfc", UNORM2_COMPOSE, status); |
1546 | 8 | } else if (lower_mode == "nfd") { |
1547 | 2 | normalizer = icu::Normalizer2::getNFDInstance(status); |
1548 | 6 | } else if (lower_mode == "nfkc") { |
1549 | 0 | normalizer = icu::Normalizer2::getInstance(nullptr, "nfkc", UNORM2_COMPOSE, status); |
1550 | 6 | } else if (lower_mode == "nfkd") { |
1551 | 2 | normalizer = icu::Normalizer2::getNFKDInstance(status); |
1552 | 4 | } else if (lower_mode == "nfkc_cf") { |
1553 | 2 | normalizer = icu::Normalizer2::getInstance(nullptr, "nfkc_cf", UNORM2_COMPOSE, status); |
1554 | 2 | } else { |
1555 | 2 | return Status::InvalidArgument( |
1556 | 2 | "Invalid normalization mode '{}' for function {}. " |
1557 | 2 | "Supported modes: NFC, NFD, NFKC, NFKD, NFKC_CF", |
1558 | 2 | lower_mode, get_name()); |
1559 | 2 | } |
1560 | | |
1561 | 11 | if (U_FAILURE(status) || normalizer == nullptr) { |
1562 | 0 | return Status::InvalidArgument( |
1563 | 0 | "Failed to get normalizer instance for mode '{}' in function {}: {}", |
1564 | 0 | lower_mode, get_name(), u_errorName(status)); |
1565 | 0 | } |
1566 | | |
1567 | 11 | auto state = std::make_shared<UnicodeNormalizeState>(); |
1568 | 11 | state->normalizer = normalizer; |
1569 | 11 | context->set_function_state(scope, state); |
1570 | 11 | return Status::OK(); |
1571 | 11 | } |
1572 | | |
1573 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
1574 | 11 | uint32_t result, size_t input_rows_count) const override { |
1575 | 11 | auto* state = reinterpret_cast<UnicodeNormalizeState*>( |
1576 | 11 | context->get_function_state(FunctionContext::FRAGMENT_LOCAL)); |
1577 | 11 | if (state == nullptr || state->normalizer == nullptr) { |
1578 | 0 | return Status::RuntimeError("unicode_normalize function state is not initialized"); |
1579 | 0 | } |
1580 | | |
1581 | 11 | ColumnPtr col = |
1582 | 11 | block.get_by_position(arguments[0]).column->convert_to_full_column_if_const(); |
1583 | 11 | const auto* col_str = check_and_get_column<ColumnString>(col.get()); |
1584 | 11 | if (col_str == nullptr) { |
1585 | 0 | return Status::RuntimeError("Illegal column {} of argument of function {}", |
1586 | 0 | block.get_by_position(arguments[0]).column->get_name(), |
1587 | 0 | get_name()); |
1588 | 0 | } |
1589 | | |
1590 | 11 | const auto& data = col_str->get_chars(); |
1591 | 11 | const auto& offsets = col_str->get_offsets(); |
1592 | | |
1593 | 11 | auto res = ColumnString::create(); |
1594 | 11 | auto& res_data = res->get_chars(); |
1595 | 11 | auto& res_offsets = res->get_offsets(); |
1596 | | |
1597 | 11 | size_t rows = offsets.size(); |
1598 | 11 | res_offsets.resize(rows); |
1599 | | |
1600 | 11 | std::string tmp; |
1601 | 22 | for (size_t i = 0; i < rows; ++i) { |
1602 | 11 | const char* begin = reinterpret_cast<const char*>(&data[offsets[i - 1]]); |
1603 | 11 | size_t len = offsets[i] - offsets[i - 1]; |
1604 | | |
1605 | 11 | normalize_one(state->normalizer, begin, len, tmp); |
1606 | 11 | StringOP::push_value_string(tmp, i, res_data, res_offsets); |
1607 | 11 | } |
1608 | | |
1609 | 11 | block.replace_by_position(result, std::move(res)); |
1610 | 11 | return Status::OK(); |
1611 | 11 | } |
1612 | | |
1613 | | private: |
1614 | | struct UnicodeNormalizeState { |
1615 | | const icu::Normalizer2* normalizer = nullptr; |
1616 | | }; |
1617 | | |
1618 | | static void normalize_one(const icu::Normalizer2* normalizer, const char* input, size_t length, |
1619 | 11 | std::string& output) { |
1620 | 11 | if (length == 0) { |
1621 | 1 | output.clear(); |
1622 | 1 | return; |
1623 | 1 | } |
1624 | | |
1625 | 10 | icu::StringPiece sp(input, static_cast<int32_t>(length)); |
1626 | 10 | icu::UnicodeString src16 = icu::UnicodeString::fromUTF8(sp); |
1627 | | |
1628 | 10 | UErrorCode status = U_ZERO_ERROR; |
1629 | 10 | UNormalizationCheckResult quick = normalizer->quickCheck(src16, status); |
1630 | 10 | if (U_SUCCESS(status) && quick == UNORM_YES) { |
1631 | 4 | output.assign(input, length); |
1632 | 4 | return; |
1633 | 4 | } |
1634 | | |
1635 | 6 | icu::UnicodeString result16; |
1636 | 6 | status = U_ZERO_ERROR; |
1637 | 6 | normalizer->normalize(src16, result16, status); |
1638 | 6 | if (U_FAILURE(status)) { |
1639 | 0 | output.assign(input, length); |
1640 | 0 | return; |
1641 | 0 | } |
1642 | | |
1643 | 6 | output.clear(); |
1644 | 6 | result16.toUTF8String(output); |
1645 | 6 | } |
1646 | | }; |
1647 | | |
1648 | | using FunctionMakeSet = FunctionNeedsToHandleNull<MakeSetImpl, PrimitiveType::TYPE_STRING>; |
1649 | | |
1650 | 8 | void register_function_string_misc(SimpleFunctionFactory& factory) { |
1651 | 8 | factory.register_function<FunctionAutoPartitionName>(); |
1652 | 8 | factory.register_function<FunctionConvertTo>(); |
1653 | 8 | factory.register_function<FunctionIntToChar>(); |
1654 | 8 | factory.register_function<FunctionRandomBytes>(); |
1655 | 8 | factory.register_function<FunctionTranslate>(); |
1656 | 8 | factory.register_function<FunctionNgramSearch>(); |
1657 | 8 | factory.register_function<FunctionXPathString>(); |
1658 | 8 | factory.register_function<FunctionCrc32Internal>(); |
1659 | 8 | factory.register_function<FunctionMakeSet>(); |
1660 | 8 | factory.register_function<FunctionExportSet>(); |
1661 | 8 | factory.register_function<FunctionUnicodeNormalize>(); |
1662 | 8 | } |
1663 | | |
1664 | | #include "common/compile_check_avoid_end.h" |
1665 | | } // namespace doris |