Coverage Report

Created: 2026-04-16 14:28

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exprs/function/function_string_misc.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include <crc32c/crc32c.h>
19
#include <fmt/format.h>
20
#include <glog/logging.h>
21
#include <unicode/normalizer2.h>
22
#include <unicode/stringpiece.h>
23
#include <unicode/unistr.h>
24
25
#include <algorithm>
26
#include <bit>
27
#include <boost/locale.hpp>
28
#include <climits>
29
#include <cstddef>
30
#include <cstdint>
31
#include <cstdlib>
32
#include <cstring>
33
#include <format>
34
#include <iomanip>
35
#include <memory>
36
#include <random>
37
#include <sstream>
38
#include <string>
39
#include <string_view>
40
#include <unordered_map>
41
#include <utility>
42
#include <vector>
43
44
#include "common/compiler_util.h"
45
#include "common/exception.h"
46
#include "common/status.h"
47
#include "core/assert_cast.h"
48
#include "core/block/block.h"
49
#include "core/block/column_numbers.h"
50
#include "core/block/column_with_type_and_name.h"
51
#include "core/column/column.h"
52
#include "core/column/column_const.h"
53
#include "core/column/column_nullable.h"
54
#include "core/column/column_string.h"
55
#include "core/column/column_vector.h"
56
#include "core/data_type/data_type.h"
57
#include "core/data_type/data_type_nullable.h"
58
#include "core/data_type/data_type_number.h"
59
#include "core/data_type/data_type_string.h"
60
#include "core/data_type/define_primitive_type.h"
61
#include "core/memcpy_small.h"
62
#include "core/pod_array.h"
63
#include "core/string_ref.h"
64
#include "core/types.h"
65
#include "exec/common/hash_table/phmap_fwd_decl.h"
66
#include "exec/common/pinyin.h"
67
#include "exec/common/stringop_substring.h"
68
#include "exec/common/template_helpers.hpp"
69
#include "exprs/function/function.h"
70
#include "exprs/function/function_helpers.h"
71
#include "exprs/function/function_needs_to_handle_null.h"
72
#include "exprs/function_context.h"
73
#include "pugixml.hpp"
74
#include "util/hash_util.hpp"
75
#include "util/raw_value.h"
76
#include "util/simd/vstring_function.h"
77
#include "util/string_util.h"
78
#include "util/utf8_check.h"
79
80
#ifndef USE_LIBCPP
81
#include <memory_resource>
82
#define PMR std::pmr
83
#else
84
#include <boost/container/pmr/monotonic_buffer_resource.hpp>
85
#include <boost/container/pmr/vector.hpp>
86
#define PMR boost::container::pmr
87
#endif
88
89
#include "exprs/function/simple_function_factory.h"
90
91
namespace doris {
92
#include "common/compile_check_avoid_begin.h"
93
94
/// Scalar function `auto_partition_name(type, ...)`: builds a partition name per input row.
///
///   * `type` starting with "list"  -> `auto_partition_name('list', v1, v2, ...)`:
///     'p' followed by an encoded form of every value (see `_auto_partition_type_of_list`).
///   * anything else (range)        -> `auto_partition_name('range', unit, date_string)`:
///     'p' followed by the date truncated to `unit` and zero-padded to 14 digits
///     (see `_auto_partition_type_of_range`).
///
/// The partition type (argument 0) and the range unit (argument 1) are read from the first row
/// only; every argument is treated as a string column (optionally nullable).
class FunctionAutoPartitionName : public IFunction {
public:
    static constexpr auto name = "auto_partition_name";
    static FunctionPtr create() { return std::make_shared<FunctionAutoPartitionName>(); }
    String get_name() const override { return name; }
    size_t get_number_of_arguments() const override { return 0; }
    bool is_variadic() const override { return true; }
    // NULL inputs are handled explicitly: the list encoding maps a NULL value to 'X'.
    bool use_default_implementation_for_nulls() const override { return false; }
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
        return std::make_shared<DataTypeString>();
    }

    /// Unwraps every argument into (chars, offsets, null map), then dispatches on the
    /// partition type found in the first row of argument 0.
    ///
    /// Returns InvalidArgument when no argument is given, or when the range form is called
    /// with fewer than three arguments.
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
                        uint32_t result, size_t input_rows_count) const override {
        const size_t argument_size = arguments.size();
        if (argument_size == 0) {
            return Status::InvalidArgument("function {} requires at least one argument", name);
        }

        // All-zero null map shared by every argument that is not nullable.
        auto const_null_map = ColumnUInt8::create(input_rows_count, 0);
        std::vector<const ColumnString::Chars*> chars_list(argument_size);
        std::vector<const ColumnString::Offsets*> offsets_list(argument_size);
        std::vector<bool> is_const_args(argument_size);
        std::vector<const ColumnUInt8::Container*> null_list(argument_size);
        // Keeps the null-map columns alive after argument_columns[i] is re-pointed at the
        // nested column below; null_list[i] points into them.
        std::vector<ColumnPtr> argument_null_columns(argument_size);

        std::vector<ColumnPtr> argument_columns(argument_size);
        for (size_t i = 0; i < argument_size; ++i) {
            argument_columns[i] =
                    block.get_by_position(arguments[i]).column->convert_to_full_column_if_const();
            if (const auto* nullable =
                        check_and_get_column<const ColumnNullable>(*argument_columns[i])) {
                null_list[i] = &nullable->get_null_map_data();
                argument_null_columns[i] = nullable->get_null_map_column_ptr();
                argument_columns[i] = nullable->get_nested_column_ptr();
            } else {
                null_list[i] = &const_null_map->get_data();
            }

            const auto* col_str = assert_cast<const ColumnString*>(argument_columns[i].get());
            chars_list[i] = &col_str->get_chars();
            offsets_list[i] = &col_str->get_offsets();
            // The const flag comes from the original (unmaterialized) column. The column used
            // here is already materialized, so reading row 0 for a const argument is still valid.
            is_const_args[i] = unpack_if_const(block.get_by_position(arguments[i]).column).second;
        }

        auto res = ColumnString::create();
        auto& res_data = res->get_chars();
        auto& res_offset = res->get_offsets();
        res_offset.resize(input_rows_count);

        // With no rows there is no first row to read the partition type from.
        if (input_rows_count == 0) {
            block.get_by_position(result).column = std::move(res);
            return Status::OK();
        }

        // partition type is list|range
        const std::string_view partition_type =
                _first_row_value(*chars_list[0], *offsets_list[0]);
        if (partition_type.starts_with("list")) {
            return _auto_partition_type_of_list(chars_list, offsets_list, is_const_args, null_list,
                                                res_data, res_offset, input_rows_count,
                                                argument_size, block, result, res);
        }

        // The range form reads argument 1 (unit) and argument 2 (date string).
        if (argument_size < 3) {
            return Status::InvalidArgument(
                    "function {} with range partition requires 3 arguments, but got {}", name,
                    argument_size);
        }
        return _auto_partition_type_of_range(chars_list, offsets_list, is_const_args, res_data,
                                             res_offset, input_rows_count, argument_size, block,
                                             result, res);
    }

private:
    /// Returns a view of row 0 of a string column. `offsets` must not be empty.
    /// Row 0 starts at byte 0 of `chars` and ends at `offsets[0]`.
    static std::string_view _first_row_value(const ColumnString::Chars& chars,
                                             const ColumnString::Offsets& offsets) {
        return {chars.raw_data(), static_cast<size_t>(offsets[0])};
    }

    /// Converts a UTF-8 string to UTF-16 code units.
    std::u16string _string_to_u16string(const std::string& str) const {
        return boost::locale::conv::utf_to_utf<char16_t>(str);
    }

    /// Encodes a UTF-16 string for use inside a partition name:
    ///   * a leading '-' adds an extra '_' prefix (the '-' itself is still encoded below);
    ///   * ASCII letters and digits are kept as is;
    ///   * every other code unit is replaced by the lower-case hex of the code point at
    ///     that index (at least two digits).
    ///
    /// NOTE(review): the low half of a surrogate pair is visited again on the next iteration
    /// and emitted a second time on its own. Left unchanged on purpose, because altering it
    /// would change the generated names; confirm against the name generator this must match.
    std::string _string_to_unicode(const std::u16string& s) const {
        std::string res_s;
        res_s.reserve(s.size());
        if (!s.empty() && s[0] == '-') {
            res_s += '_';
        }
        for (size_t i = 0; i < s.length(); i++) {
            const char16_t ch = s[i];
            if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || (ch >= '0' && ch <= '9')) {
                res_s += static_cast<char>(ch);
            } else {
                const int unicodeValue = _get_code_point_at(s, i);
                res_s += fmt::format("{:02x}", static_cast<uint32_t>(unicodeValue));
            }
        }
        return res_s;
    }

    /// Returns the code point starting at `index`: a valid high/low surrogate pair is
    /// combined, anything else is returned as the single code unit.
    int _get_code_point_at(const std::u16string& str, std::size_t index) const {
        const char16_t first = str[index];
        // [0xD800,0xDBFF] is the scope of the first code unit
        if ((first >= 0xD800 && first <= 0xDBFF) && (index + 1 < str.size())) {
            const char16_t second = str[index + 1];
            // [0xDC00,0xDFFF] is the scope of the second code unit
            if (second >= 0xDC00 && second <= 0xDFFF) {
                return ((first - 0xD800) << 10) + (second - 0xDC00) + 0x10000;
            }
        }

        return first;
    }

    /// List partition name: 'p' + for each value argument (index >= 1) either 'X' for NULL or
    /// `<encoded value><number of UTF-16 code units>`. A name longer than 50 bytes is cut to
    /// its first 50 bytes followed by '_' and the 8-digit hex hash of the full name.
    /// Stores the result column into `block` at position `result`.
    Status _auto_partition_type_of_list(std::vector<const ColumnString::Chars*>& chars_list,
                                        std::vector<const ColumnString::Offsets*>& offsets_list,
                                        std::vector<bool>& is_const_args,
                                        const std::vector<const ColumnUInt8::Container*>& null_list,
                                        auto& res_data, auto& res_offset, size_t input_rows_count,
                                        size_t argument_size, Block& block, uint32_t result,
                                        auto& res) const {
        // Total bytes written so far; size_t so that large results cannot overflow an int.
        size_t curr_len = 0;
        for (size_t row = 0; row < input_rows_count; row++) {
            std::string res_p;
            res_p.reserve(argument_size * 5);
            res_p += 'p';
            for (size_t col = 1; col < argument_size; col++) {
                const auto& current_offsets = *offsets_list[col];
                const auto& current_chars = *chars_list[col];
                const auto& current_nullmap = *null_list[col];

                if (current_nullmap[row]) {
                    res_p += 'X';
                } else {
                    auto idx = index_check_const(row, is_const_args[col]);

                    int size = current_offsets[idx] - current_offsets[idx - 1];
                    const char* raw_chars =
                            reinterpret_cast<const char*>(&current_chars[current_offsets[idx - 1]]);
                    // convert string to u16string in order to convert to unicode strings
                    const std::string raw_str(raw_chars, size);
                    auto u16string = _string_to_u16string(raw_str);
                    res_p += _string_to_unicode(u16string) + std::to_string(u16string.size());
                }
            }

            // check the name of length
            if (res_p.size() > 50) {
                // Format the hash as unsigned: with a signed argument "{:08x}" prints a '-'
                // sign plus the magnitude for negative hashes instead of 8 hex digits.
                res_p = std::format("{}_{:08x}", res_p.substr(0, 50),
                                    static_cast<uint32_t>(to_hash_code(res_p)));
            }
            const size_t len = res_p.size();
            curr_len += len;
            res_data.resize(curr_len);
            memcpy(&res_data[res_offset[row - 1]], res_p.data(), len);
            res_offset[row] = res_offset[row - 1] + len;
        }
        block.get_by_position(result).column = std::move(res);
        return Status::OK();
    }

    /// Copies the first `len` entries of `date_str` back to back into row `row` of `res_data`,
    /// starting one byte after the row start (byte 0 is the 'p' prefix).
    /// Returns the number of row bytes in use afterwards, the prefix byte included.
    size_t _copy_date_str_of_len_to_res_data(auto& res_data, auto& res_offset,
                                             std::vector<std::string>& date_str, size_t row,
                                             size_t len) const {
        size_t curr_len = 1;
        for (size_t j = 0; j < len; j++) {
            memcpy(&res_data[res_offset[row - 1]] + curr_len, date_str[j].c_str(),
                   date_str[j].size());
            curr_len += date_str[j].size();
        }
        return curr_len;
    }

    /// Range partition name: a fixed 15-byte string 'p' + yyyymmddhhmmss, where the date in
    /// argument 2 is truncated to the unit given in the first row of argument 1.
    ///
    /// Returns InvalidArgument when the unit is not one of year|month|day|hour|minute|second
    /// or when a row is not formatted as `yyyy-mm-dd` / `yyyy-mm-dd hh:mm:ss`.
    /// Stores the result column into `block` at position `result`.
    Status _auto_partition_type_of_range(std::vector<const ColumnString::Chars*>& chars_list,
                                         std::vector<const ColumnString::Offsets*>& offsets_list,
                                         std::vector<bool>& is_const_args, auto& res_data,
                                         auto& res_offset, size_t input_rows_count,
                                         size_t argument_size, Block& block, uint32_t result,
                                         auto& res) const {
        // 'p' + 14 digits
        constexpr size_t kNameLen = 15;

        if (input_rows_count == 0) {
            block.get_by_position(result).column = std::move(res);
            return Status::OK();
        }

        // The unit does not change between rows: resolve it once into the number of date
        // components to keep and the literal digits appended right after them.
        const std::string_view range_type = _first_row_value(*chars_list[1], *offsets_list[1]);
        size_t kept_parts = 0;
        std::string_view suffix;
        if (range_type.starts_with("year")) {
            kept_parts = 1;
            suffix = "0101";
        } else if (range_type.starts_with("month")) {
            kept_parts = 2;
            suffix = "01";
        } else if (range_type.starts_with("day")) {
            kept_parts = 3;
        } else if (range_type.starts_with("hour")) {
            kept_parts = 4;
        } else if (range_type.starts_with("minute")) {
            kept_parts = 5;
        } else if (range_type.starts_with("second")) {
            kept_parts = 6;
        } else {
            // Without this check the zero fill below would overwrite the 'p' prefix and
            // silently produce a name made of 15 zeros.
            return Status::InvalidArgument(
                    "The range partition unit '{}' is invalid, expect "
                    "year|month|day|hour|minute|second",
                    range_type);
        }

        // Compiled once per call instead of once per row.
        const RE2 date_regex(R"(^\d{4}-\d{2}-\d{2}( \d{2}:\d{2}:\d{2})?$)");

        res_data.resize(kNameLen * input_rows_count);
        for (size_t i = 0; i < input_rows_count; i++) {
            const auto& current_offsets = *offsets_list[2];
            const auto& current_chars = *chars_list[2];

            auto idx = index_check_const(i, is_const_args[2]);
            int size = current_offsets[idx] - current_offsets[idx - 1];
            const char* tmp =
                    reinterpret_cast<const char*>(&current_chars[current_offsets[idx - 1]]);
            const std::string to_split_s(tmp, size);

            // check the str if it is date|datetime
            if (!RE2::FullMatch(to_split_s, date_regex)) {
                return Status::InvalidArgument("The range partition only support DATE|DATETIME");
            }

            // split date_str from (yyyy-mm-dd hh:mm:ss) to ([yyyy, mm, dd, hh, mm, ss])
            // A date-only input leaves the three time components empty.
            std::vector<std::string> date_str(6);
            date_str[0] = to_split_s.substr(0, 4);
            for (int ni = 5, j = 1; ni <= size; ni += 3, j++) {
                date_str[j] = to_split_s.substr(ni, 2);
            }

            auto* row_begin = &res_data[res_offset[i - 1]];
            row_begin[0] = 'p';
            // raw => 2022-12-12 11:30:20
            // year => 2022 01 01 00 00 00
            // month => 2022 12 01 00 00 00
            // day => 2022 12 12 00 00 00
            // hour => 2022 12 12 11 00 00
            // minute => 2022 12 12 11 30 00
            // second => 2022 12 12 11 30 20
            size_t curr_len =
                    _copy_date_str_of_len_to_res_data(res_data, res_offset, date_str, i, kept_parts);
            if (!suffix.empty()) {
                memcpy(row_begin + curr_len, suffix.data(), suffix.size());
                curr_len += suffix.size();
            }

            // fill in zero
            std::fill_n(row_begin + curr_len, kNameLen - curr_len, '0');
            res_offset[i] = res_offset[i - 1] + kNameLen;
        }
        block.get_by_position(result).column = std::move(res);
        return Status::OK();
    }

    /// 32-bit polynomial hash of the bytes of `str`: h = h * 31 + byte, truncated to 32 bits
    /// after every step. The value is returned reinterpreted as a signed 32-bit integer.
    int32_t to_hash_code(const std::string& str) const {
        uint64_t h = 0;
        for (uint8_t c : str) {
            h = (h * 31U + c) & 0xFFFFFFFFU;
        }
        return static_cast<int32_t>(h);
    }
};
331
332
class FunctionRandomBytes : public IFunction {
333
public:
334
    static constexpr auto name = "random_bytes";
335
2
    static FunctionPtr create() { return std::make_shared<FunctionRandomBytes>(); }
336
1
    String get_name() const override { return name; }
337
0
    size_t get_number_of_arguments() const override { return 1; }
338
1
    bool is_variadic() const override { return false; }
339
340
0
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
341
0
        return std::make_shared<DataTypeString>();
342
0
    }
343
344
0
    bool use_default_implementation_for_constants() const final { return false; }
345
346
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
347
0
                        uint32_t result, size_t input_rows_count) const override {
348
0
        auto res = ColumnString::create();
349
0
        auto& res_offsets = res->get_offsets();
350
0
        auto& res_chars = res->get_chars();
351
0
        res_offsets.resize(input_rows_count);
352
353
0
        auto [arg_col, arg_const] = unpack_if_const(block.get_by_position(arguments[0]).column);
354
0
        const auto* length_col = assert_cast<const ColumnInt32*>(arg_col.get());
355
356
0
        if (arg_const) {
357
0
            res_chars.reserve(input_rows_count * (length_col->get_element(0) + 2));
358
0
        }
359
360
0
        std::vector<uint8_t, Allocator_<uint8_t>> random_bytes;
361
0
        std::random_device rd;
362
0
        std::mt19937 gen(rd());
363
364
0
        std::uniform_int_distribution<unsigned short> distribution(0, 255);
365
0
        for (size_t i = 0; i < input_rows_count; ++i) {
366
0
            size_t index = index_check_const(i, arg_const);
367
0
            if (length_col->get_element(index) < 0) [[unlikely]] {
368
0
                return Status::InvalidArgument("argument {} of function {} at row {} was invalid.",
369
0
                                               length_col->get_element(index), name, index);
370
0
            }
371
0
            random_bytes.resize(length_col->get_element(index));
372
373
0
            for (auto& byte : random_bytes) {
374
0
                byte = distribution(gen) & 0xFF;
375
0
            }
376
377
0
            std::basic_ostringstream<char, std::char_traits<char>, Allocator_<char>> oss;
378
0
            for (const auto& byte : random_bytes) {
379
0
                oss << std::setw(2) << std::setfill('0') << std::hex << static_cast<int>(byte);
380
0
            }
381
382
0
            StringOP::push_value_string("0x" + oss.str(), i, res_chars, res_offsets);
383
0
            random_bytes.clear();
384
0
        }
385
386
0
        block.get_by_position(result).column = std::move(res);
387
388
0
        return Status::OK();
389
0
    }
390
};
391
392
class FunctionConvertTo : public IFunction {
393
public:
394
    static constexpr auto name = "convert_to";
395
396
2
    static FunctionPtr create() { return std::make_shared<FunctionConvertTo>(); }
397
398
1
    String get_name() const override { return name; }
399
400
0
    size_t get_number_of_arguments() const override { return 2; }
401
402
0
    DataTypePtr get_return_type_impl(const DataTypes& /*arguments*/) const override {
403
0
        return std::make_shared<DataTypeString>();
404
0
    }
405
406
0
    Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override {
407
0
        if (scope != FunctionContext::THREAD_LOCAL) {
408
0
            return Status::OK();
409
0
        }
410
0
        if (!context->is_col_constant(1)) {
411
0
            return Status::InvalidArgument(
412
0
                    "character argument to convert function must be constant.");
413
0
        }
414
0
        const auto& character_data = context->get_constant_col(1)->column_ptr->get_data_at(0);
415
0
        if (!iequal(character_data.to_string(), "gbk")) {
416
0
            return Status::RuntimeError(
417
0
                    "Illegal second argument column of function convert. now only support "
418
0
                    "convert to character set of gbk");
419
0
        }
420
421
0
        return Status::OK();
422
0
    }
423
424
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
425
0
                        uint32_t result, size_t input_rows_count) const override {
426
0
        ColumnPtr argument_column =
427
0
                block.get_by_position(arguments[0]).column->convert_to_full_column_if_const();
428
0
        const ColumnString* str_col = static_cast<const ColumnString*>(argument_column.get());
429
0
        const auto& str_offset = str_col->get_offsets();
430
0
        const auto& str_chars = str_col->get_chars();
431
0
        auto col_res = ColumnString::create();
432
0
        auto& res_offset = col_res->get_offsets();
433
0
        auto& res_chars = col_res->get_chars();
434
0
        res_offset.resize(input_rows_count);
435
        // max pinyin size is 6 + 1 (first '~') for utf8 chinese word 3
436
0
        size_t pinyin_size = (str_chars.size() + 2) / 3 * 7;
437
0
        ColumnString::check_chars_length(pinyin_size, 0);
438
0
        res_chars.resize(pinyin_size);
439
440
0
        size_t in_len = 0, out_len = 0;
441
0
        for (int i = 0; i < input_rows_count; ++i) {
442
0
            in_len = str_offset[i] - str_offset[i - 1];
443
0
            const char* in = reinterpret_cast<const char*>(&str_chars[str_offset[i - 1]]);
444
0
            char* out = reinterpret_cast<char*>(&res_chars[res_offset[i - 1]]);
445
0
            _utf8_to_pinyin(in, in_len, out, &out_len);
446
0
            res_offset[i] = res_offset[i - 1] + out_len;
447
0
        }
448
0
        res_chars.resize(res_offset[input_rows_count - 1]);
449
0
        block.replace_by_position(result, std::move(col_res));
450
0
        return Status::OK();
451
0
    }
452
453
0
    void _utf8_to_pinyin(const char* in, size_t in_len, char* out, size_t* out_len) const {
454
0
        auto do_memcpy = [](char*& dest, const char*& from, size_t size) {
455
0
            memcpy_small_allow_read_write_overflow15(dest, from, size);
456
0
            dest += size;
457
0
            from += size;
458
0
        };
459
0
        auto from = in;
460
0
        auto dest = out;
461
462
0
        while (from - in < in_len) {
463
0
            auto length = get_utf8_byte_length(*from);
464
0
            if (length != 3) {
465
0
                do_memcpy(dest, from, length);
466
0
            } else {
467
                // convert utf8 to unicode code to get pinyin offset
468
0
                if (auto tmp = (((int)(*from & 0x0F)) << 12) | (((int)(*(from + 1) & 0x3F)) << 6) |
469
0
                               (*(from + 2) & 0x3F);
470
0
                    tmp >= START_UNICODE_OFFSET and tmp < END_UNICODE_OFFSET) {
471
0
                    const char* buf = nullptr;
472
0
                    if (tmp >= START_UNICODE_OFFSET && tmp < MID_UNICODE_OFFSET) {
473
0
                        buf = PINYIN_DICT1 + (tmp - START_UNICODE_OFFSET) * MAX_PINYIN_LEN;
474
0
                    } else if (tmp >= MID_UNICODE_OFFSET && tmp < END_UNICODE_OFFSET) {
475
0
                        buf = PINYIN_DICT2 + (tmp - MID_UNICODE_OFFSET) * MAX_PINYIN_LEN;
476
0
                    }
477
478
0
                    auto end = strchr(buf, ' ');
479
                    // max len for pinyin is 6
480
0
                    int len = MAX_PINYIN_LEN;
481
0
                    if (end != nullptr && end - buf < MAX_PINYIN_LEN) {
482
0
                        len = end - buf;
483
0
                    }
484
                    // set first char '~' just make sure all english word lower than chinese word
485
0
                    *dest = 126;
486
0
                    memcpy(dest + 1, buf, len);
487
0
                    dest += (len + 1);
488
0
                    from += 3;
489
0
                } else {
490
0
                    do_memcpy(dest, from, 3);
491
0
                }
492
0
            }
493
0
        }
494
495
0
        *out_len = dest - out;
496
0
    }
497
};
498
// +-----------------------------------+
499
// | 丝                                |
500
// +-----------------------------------+
501
// 1 row in set, 1 warning (0.00 sec)
502
// mysql> select char(14989469 using utf8);
503
// +---------------------------+
504
// | char(14989469 using utf8) |
505
// +---------------------------+
506
// | 丝                        |
507
// +---------------------------+
508
// 1 row in set, 1 warning (0.00 sec)
509
// mysql> select char(0xe5, 0xa4, 0x9a, 0xe7, 0x9d, 0xbf, 0xe4, 0xb8, 0x9d, 68, 111, 114, 105, 115 using utf8);
510
// +---------------------------------------------------------------------------------------------+
511
// | char(0xe5, 0xa4, 0x9a, 0xe7, 0x9d, 0xbf, 0xe4, 0xb8, 0x9d, 68, 111, 114, 105, 115 using utf8) |
512
// +---------------------------------------------------------------------------------------------+
513
// | 多睿丝 Doris                                                                                 |
514
// +---------------------------------------------------------------------------------------------+
515
// mysql> select char(68, 111, 114, 0, 105, null, 115 using utf8);
516
// +--------------------------------------------------+
517
// | char(68, 111, 114, 0, 105, null, 115 using utf8) |
518
// +--------------------------------------------------+
519
// | Dor is                                           |
520
// +--------------------------------------------------+
521
522
// return null:
523
// mysql>  select char(255 using utf8);
524
// +----------------------+
525
// | char(255 using utf8) |
526
// +----------------------+
527
// | NULL                 |
528
// +----------------------+
529
// 1 row in set, 2 warnings (0.00 sec)
530
//
531
// mysql> show warnings;
532
// +---------+------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
533
// | Level   | Code | Message                                                                                                                                                                     |
534
// +---------+------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
535
// | Warning | 3719 | 'utf8' is currently an alias for the character set UTF8MB3, but will be an alias for UTF8MB4 in a future release. Please consider using UTF8MB4 in order to be unambiguous. |
536
// | Warning | 1300 | Invalid utf8mb3 character string: 'FF'                                                                                                                                      |
537
// +---------+------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
538
// 2 rows in set (0.01 sec)
539
540
// max int value:
541
// mysql> select char(18446744073709551615);
542
// +--------------------------------------------------------+
543
// | char(18446744073709551615)                             |
544
// +--------------------------------------------------------+
545
// | 0xFFFFFFFF                                             |
546
// +--------------------------------------------------------+
547
// 1 row in set (0.00 sec)
548
//
549
// mysql> select char(18446744073709551616);
550
// +--------------------------------------------------------+
551
// | char(18446744073709551616)                             |
552
// +--------------------------------------------------------+
553
// | 0xFFFFFFFF                                             |
554
// +--------------------------------------------------------+
555
// 1 row in set, 1 warning (0.00 sec)
556
//
557
// mysql> show warnings;
558
// +---------+------+-----------------------------------------------------------+
559
// | Level   | Code | Message                                                   |
560
// +---------+------+-----------------------------------------------------------+
561
// | Warning | 1292 | Truncated incorrect DECIMAL value: '18446744073709551616' |
562
// +---------+------+-----------------------------------------------------------+
563
// 1 row in set (0.00 sec)
564
565
// table columns:
566
// mysql> select * from t;
567
// +------+------+------+
568
// | f1   | f2   | f3   |
569
// +------+------+------+
570
// |  228 |  184 |  157 |
571
// |  228 |  184 |    0 |
572
// |  228 |  184 |   99 |
573
// |   99 |  228 |  184 |
574
// +------+------+------+
575
// 4 rows in set (0.00 sec)
576
//
577
// mysql> select char(f1, f2, f3 using utf8) from t;
578
// +-----------------------------+
579
// | char(f1, f2, f3 using utf8) |
580
// +-----------------------------+
581
// | 丝                          |
582
// |                             |
583
// |                             |
584
// | c                           |
585
// +-----------------------------+
586
// 4 rows in set, 4 warnings (0.00 sec)
587
//
588
// mysql> show warnings;
589
// +---------+------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
590
// | Level   | Code | Message                                                                                                                                                                     |
591
// +---------+------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
592
// | Warning | 3719 | 'utf8' is currently an alias for the character set UTF8MB3, but will be an alias for UTF8MB4 in a future release. Please consider using UTF8MB4 in order to be unambiguous. |
593
// | Warning | 1300 | Invalid utf8mb3 character string: 'E4B800'                                                                                                                                  |
594
// | Warning | 1300 | Invalid utf8mb3 character string: 'E4B863'                                                                                                                                  |
595
// | Warning | 1300 | Invalid utf8mb3 character string: 'E4B8'                                                                                                                                    |
596
// +---------+------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
597
class FunctionIntToChar : public IFunction {
598
public:
599
    static constexpr auto name = "char";
600
2
    static FunctionPtr create() { return std::make_shared<FunctionIntToChar>(); }
601
0
    String get_name() const override { return name; }
602
0
    size_t get_number_of_arguments() const override { return 0; }
603
1
    bool is_variadic() const override { return true; }
604
605
0
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
606
0
        return make_nullable(std::make_shared<DataTypeString>());
607
0
    }
608
0
    bool use_default_implementation_for_nulls() const override { return false; }
609
610
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
611
0
                        uint32_t result, size_t input_rows_count) const override {
612
0
        DCHECK_GE(arguments.size(), 2);
613
614
0
        int argument_size = arguments.size();
615
0
        std::vector<ColumnPtr> str_columns(argument_size - 1);
616
0
        std::vector<const ColumnString::Offsets*> offsets_list(argument_size - 1);
617
0
        std::vector<const ColumnString::Chars*> chars_list(argument_size - 1);
618
619
        // convert each argument columns to column string and then concat the string columns
620
0
        for (size_t i = 1; i < argument_size; ++i) {
621
0
            if (auto const_column = check_and_get_column<const ColumnConst>(
622
0
                        *block.get_by_position(arguments[i]).column)) {
623
                // ignore null
624
0
                if (const_column->only_null()) {
625
0
                    str_columns[i - 1] = nullptr;
626
0
                } else {
627
0
                    auto str_column = ColumnString::create();
628
0
                    auto& chars = str_column->get_chars();
629
0
                    auto& offsets = str_column->get_offsets();
630
0
                    offsets.resize(1);
631
0
                    const ColumnInt32* int_column;
632
0
                    if (auto* nullable = check_and_get_column<const ColumnNullable>(
633
0
                                const_column->get_data_column())) {
634
0
                        int_column = assert_cast<const ColumnInt32*>(
635
0
                                nullable->get_nested_column_ptr().get());
636
0
                    } else {
637
0
                        int_column =
638
0
                                assert_cast<const ColumnInt32*>(&const_column->get_data_column());
639
0
                    }
640
0
                    int int_val = int_column->get_int(0);
641
0
                    integer_to_char_(0, &int_val, chars, offsets);
642
0
                    str_columns[i - 1] =
643
0
                            ColumnConst::create(std::move(str_column), input_rows_count);
644
0
                }
645
0
                offsets_list[i - 1] = nullptr;
646
0
                chars_list[i - 1] = nullptr;
647
0
            } else {
648
0
                auto str_column = ColumnString::create();
649
0
                auto& chars = str_column->get_chars();
650
0
                auto& offsets = str_column->get_offsets();
651
                // data.resize(input_rows_count);
652
0
                offsets.resize(input_rows_count);
653
654
0
                if (auto nullable = check_and_get_column<const ColumnNullable>(
655
0
                            *block.get_by_position(arguments[i]).column)) {
656
0
                    const auto* int_data =
657
0
                            assert_cast<const ColumnInt32*>(nullable->get_nested_column_ptr().get())
658
0
                                    ->get_data()
659
0
                                    .data();
660
0
                    const auto* null_map_data = nullable->get_null_map_data().data();
661
0
                    for (size_t j = 0; j < input_rows_count; ++j) {
662
                        // ignore null
663
0
                        if (null_map_data[j]) {
664
0
                            offsets[j] = offsets[j - 1];
665
0
                        } else {
666
0
                            integer_to_char_(j, int_data + j, chars, offsets);
667
0
                        }
668
0
                    }
669
0
                } else {
670
0
                    const auto* int_data = assert_cast<const ColumnInt32*>(
671
0
                                                   block.get_by_position(arguments[i]).column.get())
672
0
                                                   ->get_data()
673
0
                                                   .data();
674
0
                    for (size_t j = 0; j < input_rows_count; ++j) {
675
0
                        integer_to_char_(j, int_data + j, chars, offsets);
676
0
                    }
677
0
                }
678
0
                offsets_list[i - 1] = &str_column->get_offsets();
679
0
                chars_list[i - 1] = &str_column->get_chars();
680
0
                str_columns[i - 1] = std::move(str_column);
681
0
            }
682
0
        }
683
684
0
        auto null_map = ColumnUInt8::create(input_rows_count, 0);
685
0
        auto res = ColumnString::create();
686
0
        auto& res_data = res->get_chars();
687
0
        auto& res_offset = res->get_offsets();
688
689
0
        size_t res_reserve_size = 0;
690
0
        for (size_t i = 0; i < argument_size - 1; ++i) {
691
0
            if (!str_columns[i]) {
692
0
                continue;
693
0
            }
694
0
            if (auto const_column = check_and_get_column<const ColumnConst>(*str_columns[i])) {
695
0
                auto str_column =
696
0
                        assert_cast<const ColumnString*>(&(const_column->get_data_column()));
697
0
                auto& offsets = str_column->get_offsets();
698
0
                res_reserve_size += (offsets[0] - offsets[-1]) * input_rows_count;
699
0
            } else {
700
0
                for (size_t j = 0; j < input_rows_count; ++j) {
701
0
                    size_t append = (*offsets_list[i])[j] - (*offsets_list[i])[j - 1];
702
                    // check whether the output might overflow(unlikely)
703
0
                    if (UNLIKELY(UINT_MAX - append < res_reserve_size)) {
704
0
                        return Status::BufferAllocFailed(
705
0
                                "function char output is too large to allocate");
706
0
                    }
707
0
                    res_reserve_size += append;
708
0
                }
709
0
            }
710
0
        }
711
0
        if ((UNLIKELY(UINT_MAX - input_rows_count < res_reserve_size))) {
712
0
            return Status::BufferAllocFailed("function char output is too large to allocate");
713
0
        }
714
0
        ColumnString::check_chars_length(res_reserve_size, 0);
715
0
        res_data.resize(res_reserve_size);
716
0
        res_offset.resize(input_rows_count);
717
718
0
        for (size_t i = 0; i < input_rows_count; ++i) {
719
0
            int current_length = 0;
720
0
            for (size_t j = 0; j < argument_size - 1; ++j) {
721
0
                if (!str_columns[j]) {
722
0
                    continue;
723
0
                }
724
0
                if (auto const_column = check_and_get_column<const ColumnConst>(*str_columns[j])) {
725
0
                    auto str_column = assert_cast<const ColumnString*, TypeCheckOnRelease::DISABLE>(
726
0
                            &(const_column->get_data_column()));
727
0
                    auto data_item = str_column->get_data_at(0);
728
0
                    memcpy_small_allow_read_write_overflow15(
729
0
                            &res_data[res_offset[i - 1]] + current_length, data_item.data,
730
0
                            data_item.size);
731
0
                    current_length += data_item.size;
732
0
                } else {
733
0
                    auto& current_offsets = *offsets_list[j];
734
0
                    auto& current_chars = *chars_list[j];
735
736
0
                    int size = current_offsets[i] - current_offsets[i - 1];
737
0
                    if (size > 0) {
738
0
                        memcpy_small_allow_read_write_overflow15(
739
0
                                &res_data[res_offset[i - 1]] + current_length,
740
0
                                &current_chars[current_offsets[i - 1]], size);
741
0
                        current_length += size;
742
0
                    }
743
0
                }
744
0
            }
745
0
            res_offset[i] = res_offset[i - 1] + current_length;
746
0
        }
747
748
        // validate utf8
749
0
        auto* null_map_data = null_map->get_data().data();
750
0
        for (size_t i = 0; i < input_rows_count; ++i) {
751
0
            if (!validate_utf8((const char*)(&res_data[res_offset[i - 1]]),
752
0
                               res_offset[i] - res_offset[i - 1])) {
753
0
                null_map_data[i] = 1;
754
0
            }
755
0
        }
756
757
0
        block.get_by_position(result).column =
758
0
                ColumnNullable::create(std::move(res), std::move(null_map));
759
0
        return Status::OK();
760
0
    }
761
762
private:
763
    void integer_to_char_(int line_num, const int* num, ColumnString::Chars& chars,
764
0
                          IColumn::Offsets& offsets) const {
765
0
        if (0 == *num) {
766
0
            chars.push_back('\0');
767
0
            offsets[line_num] = offsets[line_num - 1] + 1;
768
0
            return;
769
0
        }
770
0
        const char* bytes = (const char*)(num);
771
0
        if constexpr (std::endian::native == std::endian::little) {
772
0
            int k = 3;
773
0
            for (; k >= 0; --k) {
774
0
                if (bytes[k]) {
775
0
                    break;
776
0
                }
777
0
            }
778
0
            offsets[line_num] = offsets[line_num - 1] + k + 1;
779
0
            for (; k >= 0; --k) {
780
0
                chars.push_back(bytes[k] ? bytes[k] : '\0');
781
0
            }
782
        } else if constexpr (std::endian::native == std::endian::big) {
783
            int k = 0;
784
            for (; k < 4; ++k) {
785
                if (bytes[k]) {
786
                    break;
787
                }
788
            }
789
            offsets[line_num] = offsets[line_num - 1] + 4 - k;
790
            for (; k < 4; ++k) {
791
                chars.push_back(bytes[k] ? bytes[k] : '\0');
792
            }
793
        } else {
794
            static_assert(std::endian::native == std::endian::big ||
795
                                  std::endian::native == std::endian::little,
796
                          "Unsupported endianness");
797
        }
798
0
    }
799
};
800
801
class FunctionNgramSearch : public IFunction {
802
public:
803
    static constexpr auto name = "ngram_search";
804
2
    static FunctionPtr create() { return std::make_shared<FunctionNgramSearch>(); }
805
1
    String get_name() const override { return name; }
806
0
    size_t get_number_of_arguments() const override { return 3; }
807
0
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
808
0
        return std::make_shared<DataTypeFloat64>();
809
0
    }
810
811
    // ngram_search(text,pattern,gram_num)
812
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
813
0
                        uint32_t result, size_t input_rows_count) const override {
814
0
        CHECK_EQ(arguments.size(), 3);
815
0
        auto col_res = ColumnFloat64::create();
816
0
        bool col_const[3];
817
0
        ColumnPtr argument_columns[3];
818
0
        for (int i = 0; i < 3; ++i) {
819
0
            std::tie(argument_columns[i], col_const[i]) =
820
0
                    unpack_if_const(block.get_by_position(arguments[i]).column);
821
0
        }
822
0
        auto pattern = assert_cast<const ColumnString*>(argument_columns[1].get())->get_data_at(0);
823
0
        auto gram_num = assert_cast<const ColumnInt32*>(argument_columns[2].get())->get_element(0);
824
0
        const auto* text_col = assert_cast<const ColumnString*>(argument_columns[0].get());
825
826
0
        if (col_const[0]) {
827
0
            _execute_impl<true>(text_col, pattern, gram_num, *col_res, input_rows_count);
828
0
        } else {
829
0
            _execute_impl<false>(text_col, pattern, gram_num, *col_res, input_rows_count);
830
0
        }
831
832
0
        block.replace_by_position(result, std::move(col_res));
833
0
        return Status::OK();
834
0
    }
835
836
private:
837
    using NgramMap = phmap::flat_hash_map<uint32_t, uint8_t>;
838
    constexpr static auto not_found = 0b00;
839
    constexpr static auto found_in_pattern = 0b01;
840
    constexpr static auto found_in_text = 0b10;
841
    constexpr static auto found_in_pattern_and_text = 0b11;
842
843
0
    uint32_t sub_str_hash(const char* data, int32_t length) const {
844
0
        constexpr static uint32_t seed = 0;
845
0
        return crc32c::Extend(seed, (const uint8_t*)data, length);
846
0
    }
847
848
    template <bool column_const>
849
    void _execute_impl(const ColumnString* text_col, StringRef& pattern, int gram_num,
850
0
                       ColumnFloat64& res, size_t size) const {
851
0
        auto& res_data = res.get_data();
852
0
        res_data.resize_fill(size, 0);
853
        // If the length of the pattern is less than gram_num, return 0.
854
0
        if (pattern.size < gram_num) {
855
0
            return;
856
0
        }
857
858
        // Build a map by pattern string, which will be used repeatedly in the following loop.
859
0
        NgramMap pattern_map;
860
0
        int pattern_count = get_pattern_set(pattern_map, pattern, gram_num);
861
        // Each time a loop is executed, the map will be modified, so it needs to be restored afterward.
862
0
        std::vector<uint32_t> restore_map;
863
864
0
        for (int i = 0; i < size; i++) {
865
0
            auto text = text_col->get_data_at(index_check_const<column_const>(i));
866
0
            if (text.size < gram_num) {
867
                // If the length of the text is less than gram_num, return 0.
868
0
                continue;
869
0
            }
870
0
            restore_map.reserve(text.size);
871
0
            auto [text_count, intersection_count] =
872
0
                    get_text_set(text, gram_num, pattern_map, restore_map);
873
874
            // 2 * |Intersection| / (|text substr set| + |pattern substr set|)
875
0
            res_data[i] = 2.0 * intersection_count / (text_count + pattern_count);
876
0
        }
877
0
    }
Unexecuted instantiation: _ZNK5doris19FunctionNgramSearch13_execute_implILb1EEEvPKNS_9ColumnStrIjEERNS_9StringRefEiRNS_12ColumnVectorILNS_13PrimitiveTypeE9EEEm
Unexecuted instantiation: _ZNK5doris19FunctionNgramSearch13_execute_implILb0EEEvPKNS_9ColumnStrIjEERNS_9StringRefEiRNS_12ColumnVectorILNS_13PrimitiveTypeE9EEEm
878
879
0
    size_t get_pattern_set(NgramMap& pattern_map, StringRef& pattern, int gram_num) const {
880
0
        size_t pattern_count = 0;
881
0
        for (int i = 0; i + gram_num <= pattern.size; i++) {
882
0
            uint32_t cur_hash = sub_str_hash(pattern.data + i, gram_num);
883
0
            if (!pattern_map.contains(cur_hash)) {
884
0
                pattern_map[cur_hash] = found_in_pattern;
885
0
                pattern_count++;
886
0
            }
887
0
        }
888
0
        return pattern_count;
889
0
    }
890
891
    std::pair<size_t, size_t> get_text_set(StringRef& text, int gram_num, NgramMap& pattern_map,
892
0
                                           std::vector<uint32_t>& restore_map) const {
893
0
        restore_map.clear();
894
        //intersection_count indicates a substring both in pattern and text.
895
0
        size_t text_count = 0, intersection_count = 0;
896
0
        for (int i = 0; i + gram_num <= text.size; i++) {
897
0
            uint32_t cur_hash = sub_str_hash(text.data + i, gram_num);
898
0
            auto& val = pattern_map[cur_hash];
899
0
            if (val == not_found) {
900
0
                val ^= found_in_text;
901
0
                DCHECK(val == found_in_text);
902
                // only found in text
903
0
                text_count++;
904
0
                restore_map.push_back(cur_hash);
905
0
            } else if (val == found_in_pattern) {
906
0
                val ^= found_in_text;
907
0
                DCHECK(val == found_in_pattern_and_text);
908
                // found in text and pattern
909
0
                text_count++;
910
0
                intersection_count++;
911
0
                restore_map.push_back(cur_hash);
912
0
            }
913
0
        }
914
        // Restore the pattern_map.
915
0
        for (auto& restore_hash : restore_map) {
916
0
            pattern_map[restore_hash] ^= found_in_text;
917
0
        }
918
919
0
        return {text_count, intersection_count};
920
0
    }
921
};
922
923
class FunctionTranslate : public IFunction {
924
public:
925
    static constexpr auto name = "translate";
926
    using AsciiMap = std::array<UInt8, 128>;
927
    constexpr static UInt8 DELETE_CHAR = 255; // 255 means delete this char
928
2
    static FunctionPtr create() { return std::make_shared<FunctionTranslate>(); }
929
1
    String get_name() const override { return name; }
930
0
    size_t get_number_of_arguments() const override { return 3; }
931
932
0
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
933
0
        return std::make_shared<DataTypeString>();
934
0
    };
935
936
1
    DataTypes get_variadic_argument_types_impl() const override {
937
1
        return {std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>(),
938
1
                std::make_shared<DataTypeString>()};
939
1
    }
940
941
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
942
0
                        uint32_t result, size_t input_rows_count) const override {
943
0
        CHECK_EQ(arguments.size(), 3);
944
0
        auto col_res = ColumnString::create();
945
0
        bool col_const[3];
946
0
        ColumnPtr argument_columns[3];
947
0
        for (int i = 0; i < 3; ++i) {
948
0
            col_const[i] = is_column_const(*block.get_by_position(arguments[i]).column);
949
0
        }
950
0
        argument_columns[0] = col_const[0] ? static_cast<const ColumnConst&>(
951
0
                                                     *block.get_by_position(arguments[0]).column)
952
0
                                                     .convert_to_full_column()
953
0
                                           : block.get_by_position(arguments[0]).column;
954
0
        default_preprocess_parameter_columns(argument_columns, col_const, {1, 2}, block, arguments);
955
956
0
        const auto* col_source = assert_cast<const ColumnString*>(argument_columns[0].get());
957
0
        const auto* col_from = assert_cast<const ColumnString*>(argument_columns[1].get());
958
0
        const auto* col_to = assert_cast<const ColumnString*>(argument_columns[2].get());
959
960
0
        bool is_ascii = col_source->is_ascii() && col_from->is_ascii() && col_to->is_ascii();
961
0
        auto impl_vectors = impl_vectors_utf8<false>;
962
0
        if (col_const[1] && col_const[2] && is_ascii) {
963
0
            impl_vectors = impl_vectors_ascii<true>;
964
0
        } else if (col_const[1] && col_const[2]) {
965
0
            impl_vectors = impl_vectors_utf8<true>;
966
0
        } else if (is_ascii) {
967
0
            impl_vectors = impl_vectors_ascii<false>;
968
0
        }
969
0
        impl_vectors(col_source, col_from, col_to, col_res.get());
970
0
        block.get_by_position(result).column = std::move(col_res);
971
0
        return Status::OK();
972
0
    }
973
974
private:
975
    template <bool IsConst>
976
    static void impl_vectors_ascii(const ColumnString* col_source, const ColumnString* col_from,
977
0
                                   const ColumnString* col_to, ColumnString* col_res) {
978
0
        auto& res_chars = col_res->get_chars();
979
0
        auto& res_offsets = col_res->get_offsets();
980
0
        res_chars.reserve(col_source->get_chars().size());
981
0
        res_offsets.reserve(col_source->get_offsets().size());
982
0
        DCHECK_EQ(col_res->size(), 0);
983
0
        AsciiMap map;
984
0
        if (IsConst) {
985
0
            const auto& from_str = col_from->get_data_at(0);
986
0
            const auto& to_str = col_to->get_data_at(0);
987
0
            if (!build_translate_map_ascii(map, from_str, to_str)) {
988
                // if the map is not need delete char, we can directly copy the source string,then use map to translate
989
0
                res_offsets.insert(col_source->get_offsets().begin(),
990
0
                                   col_source->get_offsets().end());
991
0
                res_chars.insert(col_source->get_chars().begin(), col_source->get_chars().end());
992
0
                for (int i = 0; i < res_chars.size(); ++i) {
993
0
                    res_chars[i] = map[res_chars[i]]; // translate the chars
994
0
                }
995
0
                return; // no need to translate
996
0
            }
997
0
        }
998
999
0
        auto res_size = 0;
1000
0
        auto* begin_data = col_res->get_chars().data();
1001
0
        for (size_t i = 0; i < col_source->size(); ++i) {
1002
0
            const auto& source_str = col_source->get_data_at(i);
1003
0
            if (!IsConst) {
1004
0
                const auto& from_str = col_from->get_data_at(i);
1005
0
                const auto& to_str = col_to->get_data_at(i);
1006
0
                build_translate_map_ascii(map, from_str, to_str);
1007
0
            }
1008
0
            auto* dst_data = begin_data + res_size;
1009
0
            res_size += translate_ascii(source_str, map, dst_data);
1010
1011
0
            res_offsets.push_back(res_size);
1012
0
        }
1013
0
        DCHECK_GE(res_chars.capacity(), res_size);
1014
0
        res_chars.resize(res_size);
1015
0
    }
Unexecuted instantiation: _ZN5doris17FunctionTranslate18impl_vectors_asciiILb1EEEvPKNS_9ColumnStrIjEES5_S5_PS3_
Unexecuted instantiation: _ZN5doris17FunctionTranslate18impl_vectors_asciiILb0EEEvPKNS_9ColumnStrIjEES5_S5_PS3_
1016
1017
    // return true if no need delete char
1018
    bool static build_translate_map_ascii(AsciiMap& map, const StringRef& from_str,
1019
0
                                          const StringRef& to_str) {
1020
0
        for (size_t i = 0; i < map.size(); ++i) {
1021
0
            map[i] = i; // initialize map to identity
1022
0
        }
1023
0
        std::array<UInt8, 128> set_map {0};
1024
0
        const auto min_size = std::min(from_str.size, to_str.size);
1025
        // all ascii characters are in the range [0, 127]
1026
0
        for (size_t i = 0; i < min_size; ++i) {
1027
0
            auto from_char = from_str.data[i];
1028
0
            auto to_char = to_str.data[i];
1029
0
            if (set_map[from_char] == 0) {
1030
0
                set_map[from_char] = 1;
1031
0
                map[from_char] = to_char;
1032
0
            }
1033
0
        }
1034
1035
0
        bool need_delete_char = false;
1036
1037
0
        for (size_t i = min_size; i < from_str.size; ++i) {
1038
0
            auto from_char = from_str.data[i];
1039
0
            if (set_map[from_char] == 0) {
1040
0
                set_map[from_char] = 1;
1041
0
                map[from_char] = DELETE_CHAR; // delete this char
1042
0
                need_delete_char = true;
1043
0
            }
1044
0
        }
1045
0
        return need_delete_char;
1046
0
    }
1047
1048
0
    static size_t translate_ascii(const StringRef& source_str, AsciiMap& map, UInt8* dst_data) {
1049
0
        auto* begin_data = dst_data;
1050
0
        for (size_t i = 0; i < source_str.size; ++i) {
1051
0
            auto c = source_str.data[i];
1052
0
            if (map[c] == DELETE_CHAR) {
1053
0
                continue; // delete this char
1054
0
            }
1055
0
            *dst_data++ = map[c];
1056
0
        }
1057
0
        return dst_data - begin_data;
1058
0
    }
1059
1060
    template <bool IsConst>
1061
    static void impl_vectors_utf8(const ColumnString* col_source, const ColumnString* col_from,
1062
0
                                  const ColumnString* col_to, ColumnString* col_res) {
1063
0
        col_res->get_chars().reserve(col_source->get_chars().size());
1064
0
        col_res->get_offsets().reserve(col_source->get_offsets().size());
1065
0
        std::unordered_map<std::string_view, std::string_view> translate_map;
1066
0
        if (IsConst) {
1067
0
            const auto& from_str = col_from->get_data_at(0);
1068
0
            const auto& to_str = col_to->get_data_at(0);
1069
0
            translate_map =
1070
0
                    build_translate_map_utf8(from_str.to_string_view(), to_str.to_string_view());
1071
0
        }
1072
0
        for (size_t i = 0; i < col_source->size(); ++i) {
1073
0
            const auto& source_str = col_source->get_data_at(i);
1074
0
            if (!IsConst) {
1075
0
                const auto& from_str = col_from->get_data_at(i);
1076
0
                const auto& to_str = col_to->get_data_at(i);
1077
0
                translate_map = build_translate_map_utf8(from_str.to_string_view(),
1078
0
                                                         to_str.to_string_view());
1079
0
            }
1080
0
            auto translated_str = translate_utf8(source_str.to_string_view(), translate_map);
1081
0
            col_res->insert_data(translated_str.data(), translated_str.size());
1082
0
        }
1083
0
    }
Unexecuted instantiation: _ZN5doris17FunctionTranslate17impl_vectors_utf8ILb0EEEvPKNS_9ColumnStrIjEES5_S5_PS3_
Unexecuted instantiation: _ZN5doris17FunctionTranslate17impl_vectors_utf8ILb1EEEvPKNS_9ColumnStrIjEES5_S5_PS3_
1084
1085
    static std::unordered_map<std::string_view, std::string_view> build_translate_map_utf8(
1086
0
            const std::string_view& from_str, const std::string_view& to_str) {
1087
0
        std::unordered_map<std::string_view, std::string_view> translate_map;
1088
0
        for (size_t i = 0, from_char_size = 0, j = 0, to_char_size = 0; i < from_str.size();
1089
0
             i += from_char_size, j += to_char_size) {
1090
0
            from_char_size = get_utf8_byte_length(from_str[i]);
1091
0
            to_char_size = j < to_str.size() ? get_utf8_byte_length(to_str[j]) : 0;
1092
0
            auto from_char = from_str.substr(i, from_char_size);
1093
0
            if (translate_map.find(from_char) == translate_map.end()) {
1094
0
                translate_map[from_char] =
1095
0
                        j < to_str.size() ? to_str.substr(j, to_char_size) : std::string_view();
1096
0
            }
1097
0
        }
1098
0
        return translate_map;
1099
0
    }
1100
1101
    static std::string translate_utf8(
1102
            const std::string_view& source_str,
1103
0
            std::unordered_map<std::string_view, std::string_view>& translate_map) {
1104
0
        std::string result;
1105
0
        result.reserve(source_str.size());
1106
0
        for (size_t i = 0, char_size = 0; i < source_str.size(); i += char_size) {
1107
0
            char_size = get_utf8_byte_length(source_str[i]);
1108
0
            auto c = source_str.substr(i, char_size);
1109
0
            if (translate_map.find(c) != translate_map.end()) {
1110
0
                if (!translate_map[c].empty()) {
1111
0
                    result.append(translate_map[c]);
1112
0
                }
1113
0
            } else {
1114
0
                result.append(c);
1115
0
            }
1116
0
        }
1117
0
        return result;
1118
0
    }
1119
};
1120
1121
/// xpath_string(xml, xpath) -> String
1122
/// Returns the text content of the first node that matches the XPath expression.
1123
/// Returns NULL if either xml or xpath is NULL.
1124
/// Returns empty string if the XPath expression matches no nodes.
1125
/// The text content includes the node and all its descendants.
1126
/// Example:
1127
///   xpath_string('<a><b>b1</b><b>b2</b></a>', '/a/b[1]') = 'b1'
1128
///   xpath_string('<a><b>b1</b><b>b2</b></a>', '/a/b[2]') = 'b2'
1129
///   xpath_string('<a><b>b1</b><b>b2</b></a>', '/a/c') = ''
1130
///   xpath_string('invalid xml', '/a/b[1]') = NULL
1131
///   xpath_string(NULL, '/a/b[1]') = NULL
1132
///   xpath_string('<a><b>b1</b><b>b2</b></a>', NULL) = NULL
1133
class FunctionXPathString : public IFunction {
1134
public:
1135
    static constexpr auto name = "xpath_string";
1136
78
    static FunctionPtr create() { return std::make_shared<FunctionXPathString>(); }
1137
1
    String get_name() const override { return name; }
1138
76
    size_t get_number_of_arguments() const override { return 2; }
1139
76
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
1140
76
        return make_nullable(std::make_shared<DataTypeString>());
1141
76
    }
1142
1143
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
1144
67
                        uint32_t result, size_t input_rows_count) const override {
1145
67
        CHECK_EQ(arguments.size(), 2);
1146
67
        auto col_res = ColumnNullable::create(ColumnString::create(), ColumnUInt8::create());
1147
67
        const auto& [left_col, left_const] =
1148
67
                unpack_if_const(block.get_by_position(arguments[0]).column);
1149
67
        const auto& [right_col, right_const] =
1150
67
                unpack_if_const(block.get_by_position(arguments[1]).column);
1151
67
        const auto& xml_col = *assert_cast<const ColumnString*>(left_col.get());
1152
67
        const auto& xpath_col = *assert_cast<const ColumnString*>(right_col.get());
1153
1154
67
        Status status;
1155
67
        if (left_const && right_const) {
1156
0
            status = execute_vector<true, true>(input_rows_count, xml_col, xpath_col, *col_res);
1157
67
        } else if (left_const) {
1158
22
            status = execute_vector<true, false>(input_rows_count, xml_col, xpath_col, *col_res);
1159
45
        } else if (right_const) {
1160
22
            status = execute_vector<false, true>(input_rows_count, xml_col, xpath_col, *col_res);
1161
23
        } else {
1162
23
            status = execute_vector<false, false>(input_rows_count, xml_col, xpath_col, *col_res);
1163
23
        }
1164
67
        if (!status.ok()) {
1165
0
            return status;
1166
0
        }
1167
1168
67
        block.get_by_position(result).column = std::move(col_res);
1169
67
        return Status::OK();
1170
67
    }
1171
1172
private:
1173
81
    static Status parse_xml(const StringRef& xml_str, pugi::xml_document& xml_doc) {
1174
81
        pugi::xml_parse_result result = xml_doc.load_buffer(xml_str.data, xml_str.size);
1175
81
        if (!result) {
1176
0
            return Status::InvalidArgument("Function {} failed to parse XML string: {}", name,
1177
0
                                           result.description());
1178
0
        }
1179
81
        return Status::OK();
1180
81
    }
1181
1182
84
    static Status build_xpath_query(const StringRef& xpath_str, pugi::xpath_query& xpath_query) {
1183
        // xpath_query will throws xpath_exception on compilation errors.
1184
84
        try {
1185
            // NOTE!!!: don't use to_string_view(), because xpath_str maybe not null-terminated
1186
84
            xpath_query = pugi::xpath_query(xpath_str.to_string().c_str());
1187
84
        } catch (const pugi::xpath_exception& e) {
1188
0
            return Status::InvalidArgument("Function {} failed to build XPath query: {}", name,
1189
0
                                           e.what());
1190
0
        }
1191
84
        return Status::OK();
1192
84
    }
1193
1194
    template <bool left_const, bool right_const>
1195
    static Status execute_vector(const size_t input_rows_count, const ColumnString& xml_col,
1196
67
                                 const ColumnString& xpath_col, ColumnNullable& res_col) {
1197
67
        pugi::xml_document xml_doc;
1198
67
        pugi::xpath_query xpath_query;
1199
        // first check right_const, because we want to check empty input first
1200
67
        if constexpr (right_const) {
1201
22
            auto xpath_str = xpath_col.get_data_at(0);
1202
22
            if (xpath_str.empty()) {
1203
                // should return null if xpath_str is empty
1204
1
                res_col.insert_many_defaults(input_rows_count);
1205
1
                return Status::OK();
1206
1
            }
1207
21
            RETURN_IF_ERROR(build_xpath_query(xpath_str, xpath_query));
1208
21
        }
1209
22
        if constexpr (left_const) {
1210
22
            auto xml_str = xml_col.get_data_at(0);
1211
22
            if (xml_str.empty()) {
1212
                // should return null if xml_str is empty
1213
1
                res_col.insert_many_defaults(input_rows_count);
1214
1
                return Status::OK();
1215
1
            }
1216
21
            RETURN_IF_ERROR(parse_xml(xml_str, xml_doc));
1217
21
        }
1218
1219
156
        for (size_t i = 0; i < input_rows_count; ++i) {
1220
89
            if constexpr (!right_const) {
1221
68
                auto xpath_str = xpath_col.get_data_at(i);
1222
68
                if (xpath_str.empty()) {
1223
                    // should return null if xpath_str is empty
1224
5
                    res_col.insert_default();
1225
5
                    continue;
1226
5
                }
1227
63
                RETURN_IF_ERROR(build_xpath_query(xpath_str, xpath_query));
1228
63
            }
1229
68
            if constexpr (!left_const) {
1230
68
                auto xml_str = xml_col.get_data_at(i);
1231
68
                if (xml_str.empty()) {
1232
                    // should return null if xml_str is empty
1233
4
                    res_col.insert_default();
1234
4
                    continue;
1235
4
                }
1236
64
                RETURN_IF_ERROR(parse_xml(xml_str, xml_doc));
1237
64
            }
1238
64
            std::string text;
1239
89
            try {
1240
89
                text = xpath_query.evaluate_string(xml_doc);
1241
89
            } catch (const pugi::xpath_exception& e) {
1242
0
                return Status::InvalidArgument("Function {} failed to query XPath string: {}", name,
1243
0
                                               e.what());
1244
0
            }
1245
80
            res_col.insert_data(text.data(), text.size());
1246
80
        }
1247
67
        return Status::OK();
1248
67
    }
Unexecuted instantiation: _ZN5doris19FunctionXPathString14execute_vectorILb1ELb1EEENS_6StatusEmRKNS_9ColumnStrIjEES6_RNS_14ColumnNullableE
_ZN5doris19FunctionXPathString14execute_vectorILb1ELb0EEENS_6StatusEmRKNS_9ColumnStrIjEES6_RNS_14ColumnNullableE
Line
Count
Source
1196
22
                                 const ColumnString& xpath_col, ColumnNullable& res_col) {
1197
22
        pugi::xml_document xml_doc;
1198
22
        pugi::xpath_query xpath_query;
1199
        // first check right_const, because we want to check empty input first
1200
        if constexpr (right_const) {
1201
            auto xpath_str = xpath_col.get_data_at(0);
1202
            if (xpath_str.empty()) {
1203
                // should return null if xpath_str is empty
1204
                res_col.insert_many_defaults(input_rows_count);
1205
                return Status::OK();
1206
            }
1207
            RETURN_IF_ERROR(build_xpath_query(xpath_str, xpath_query));
1208
        }
1209
22
        if constexpr (left_const) {
1210
22
            auto xml_str = xml_col.get_data_at(0);
1211
22
            if (xml_str.empty()) {
1212
                // should return null if xml_str is empty
1213
1
                res_col.insert_many_defaults(input_rows_count);
1214
1
                return Status::OK();
1215
1
            }
1216
21
            RETURN_IF_ERROR(parse_xml(xml_str, xml_doc));
1217
21
        }
1218
1219
43
        for (size_t i = 0; i < input_rows_count; ++i) {
1220
21
            if constexpr (!right_const) {
1221
21
                auto xpath_str = xpath_col.get_data_at(i);
1222
21
                if (xpath_str.empty()) {
1223
                    // should return null if xpath_str is empty
1224
1
                    res_col.insert_default();
1225
1
                    continue;
1226
1
                }
1227
20
                RETURN_IF_ERROR(build_xpath_query(xpath_str, xpath_query));
1228
20
            }
1229
            if constexpr (!left_const) {
1230
                auto xml_str = xml_col.get_data_at(i);
1231
                if (xml_str.empty()) {
1232
                    // should return null if xml_str is empty
1233
                    res_col.insert_default();
1234
                    continue;
1235
                }
1236
                RETURN_IF_ERROR(parse_xml(xml_str, xml_doc));
1237
            }
1238
21
            std::string text;
1239
21
            try {
1240
21
                text = xpath_query.evaluate_string(xml_doc);
1241
21
            } catch (const pugi::xpath_exception& e) {
1242
0
                return Status::InvalidArgument("Function {} failed to query XPath string: {}", name,
1243
0
                                               e.what());
1244
0
            }
1245
20
            res_col.insert_data(text.data(), text.size());
1246
20
        }
1247
22
        return Status::OK();
1248
22
    }
_ZN5doris19FunctionXPathString14execute_vectorILb0ELb1EEENS_6StatusEmRKNS_9ColumnStrIjEES6_RNS_14ColumnNullableE
Line
Count
Source
1196
22
                                 const ColumnString& xpath_col, ColumnNullable& res_col) {
1197
22
        pugi::xml_document xml_doc;
1198
22
        pugi::xpath_query xpath_query;
1199
        // first check right_const, because we want to check empty input first
1200
22
        if constexpr (right_const) {
1201
22
            auto xpath_str = xpath_col.get_data_at(0);
1202
22
            if (xpath_str.empty()) {
1203
                // should return null if xpath_str is empty
1204
1
                res_col.insert_many_defaults(input_rows_count);
1205
1
                return Status::OK();
1206
1
            }
1207
21
            RETURN_IF_ERROR(build_xpath_query(xpath_str, xpath_query));
1208
21
        }
1209
        if constexpr (left_const) {
1210
            auto xml_str = xml_col.get_data_at(0);
1211
            if (xml_str.empty()) {
1212
                // should return null if xml_str is empty
1213
                res_col.insert_many_defaults(input_rows_count);
1214
                return Status::OK();
1215
            }
1216
            RETURN_IF_ERROR(parse_xml(xml_str, xml_doc));
1217
        }
1218
1219
43
        for (size_t i = 0; i < input_rows_count; ++i) {
1220
            if constexpr (!right_const) {
1221
                auto xpath_str = xpath_col.get_data_at(i);
1222
                if (xpath_str.empty()) {
1223
                    // should return null if xpath_str is empty
1224
                    res_col.insert_default();
1225
                    continue;
1226
                }
1227
                RETURN_IF_ERROR(build_xpath_query(xpath_str, xpath_query));
1228
            }
1229
21
            if constexpr (!left_const) {
1230
21
                auto xml_str = xml_col.get_data_at(i);
1231
21
                if (xml_str.empty()) {
1232
                    // should return null if xml_str is empty
1233
1
                    res_col.insert_default();
1234
1
                    continue;
1235
1
                }
1236
20
                RETURN_IF_ERROR(parse_xml(xml_str, xml_doc));
1237
20
            }
1238
20
            std::string text;
1239
21
            try {
1240
21
                text = xpath_query.evaluate_string(xml_doc);
1241
21
            } catch (const pugi::xpath_exception& e) {
1242
0
                return Status::InvalidArgument("Function {} failed to query XPath string: {}", name,
1243
0
                                               e.what());
1244
0
            }
1245
20
            res_col.insert_data(text.data(), text.size());
1246
20
        }
1247
22
        return Status::OK();
1248
22
    }
_ZN5doris19FunctionXPathString14execute_vectorILb0ELb0EEENS_6StatusEmRKNS_9ColumnStrIjEES6_RNS_14ColumnNullableE
Line
Count
Source
1196
23
                                 const ColumnString& xpath_col, ColumnNullable& res_col) {
1197
23
        pugi::xml_document xml_doc;
1198
23
        pugi::xpath_query xpath_query;
1199
        // first check right_const, because we want to check empty input first
1200
        if constexpr (right_const) {
1201
            auto xpath_str = xpath_col.get_data_at(0);
1202
            if (xpath_str.empty()) {
1203
                // should return null if xpath_str is empty
1204
                res_col.insert_many_defaults(input_rows_count);
1205
                return Status::OK();
1206
            }
1207
            RETURN_IF_ERROR(build_xpath_query(xpath_str, xpath_query));
1208
        }
1209
        if constexpr (left_const) {
1210
            auto xml_str = xml_col.get_data_at(0);
1211
            if (xml_str.empty()) {
1212
                // should return null if xml_str is empty
1213
                res_col.insert_many_defaults(input_rows_count);
1214
                return Status::OK();
1215
            }
1216
            RETURN_IF_ERROR(parse_xml(xml_str, xml_doc));
1217
        }
1218
1219
70
        for (size_t i = 0; i < input_rows_count; ++i) {
1220
47
            if constexpr (!right_const) {
1221
47
                auto xpath_str = xpath_col.get_data_at(i);
1222
47
                if (xpath_str.empty()) {
1223
                    // should return null if xpath_str is empty
1224
4
                    res_col.insert_default();
1225
4
                    continue;
1226
4
                }
1227
43
                RETURN_IF_ERROR(build_xpath_query(xpath_str, xpath_query));
1228
43
            }
1229
47
            if constexpr (!left_const) {
1230
47
                auto xml_str = xml_col.get_data_at(i);
1231
47
                if (xml_str.empty()) {
1232
                    // should return null if xml_str is empty
1233
3
                    res_col.insert_default();
1234
3
                    continue;
1235
3
                }
1236
44
                RETURN_IF_ERROR(parse_xml(xml_str, xml_doc));
1237
44
            }
1238
44
            std::string text;
1239
47
            try {
1240
47
                text = xpath_query.evaluate_string(xml_doc);
1241
47
            } catch (const pugi::xpath_exception& e) {
1242
0
                return Status::InvalidArgument("Function {} failed to query XPath string: {}", name,
1243
0
                                               e.what());
1244
0
            }
1245
40
            res_col.insert_data(text.data(), text.size());
1246
40
        }
1247
23
        return Status::OK();
1248
23
    }
1249
};
1250
1251
class MakeSetImpl {
1252
public:
1253
    static constexpr auto name = "make_set";
1254
1255
0
    static size_t get_number_of_arguments() { return 0; }
1256
1
    static bool is_variadic() { return true; }
1257
0
    static DataTypePtr get_return_type_impl(const DataTypes& arguments) {
1258
0
        if (arguments[0].get()->is_nullable()) {
1259
0
            return make_nullable(std::make_shared<DataTypeString>());
1260
0
        }
1261
0
        return std::make_shared<DataTypeString>();
1262
0
    }
1263
1264
    static bool is_return_nullable(bool has_nullable,
1265
0
                                   const std::vector<ColumnWithConstAndNullMap>& cols_info) {
1266
0
        return cols_info[0].null_map != nullptr;
1267
0
    }
1268
1269
    static bool execute_const_null(ColumnString::MutablePtr& res_col,
1270
                                   PaddedPODArray<UInt8>& res_null_map_data,
1271
0
                                   size_t input_rows_count, size_t null_index) {
1272
0
        if (null_index == 1) {
1273
0
            res_col->insert_many_defaults(input_rows_count);
1274
0
            res_null_map_data.assign(input_rows_count, (UInt8)1);
1275
0
            return true;
1276
0
        }
1277
0
        return false;
1278
0
    }
1279
1280
    static void execute(const std::vector<ColumnWithConstAndNullMap>& column_infos,
1281
                        ColumnString::MutablePtr& res_col, PaddedPODArray<UInt8>& res_null_map_data,
1282
0
                        size_t input_rows_count) {
1283
0
        static constexpr char SEPARATOR = ',';
1284
0
        const auto& bit_data =
1285
0
                assert_cast<const ColumnInt64&>(*column_infos[0].nested_col).get_data();
1286
0
        std::vector<const ColumnString*> str_cols(column_infos.size());
1287
0
        for (size_t i = 1; i < column_infos.size(); ++i) {
1288
0
            str_cols[i] = assert_cast<const ColumnString*>(column_infos[i].nested_col);
1289
0
        }
1290
1291
0
        for (size_t row = 0; row < input_rows_count; ++row) {
1292
0
            if (column_infos[0].is_null_at(row)) {
1293
0
                res_col->insert_default();
1294
0
                res_null_map_data[row] = 1;
1295
0
                continue;
1296
0
            }
1297
1298
0
            uint64_t bit = bit_data[column_infos[0].is_const ? 0 : row];
1299
0
            uint64_t col_pos = __builtin_ffsll(bit);
1300
0
            ColumnString::Chars data;
1301
0
            while (col_pos != 0 && col_pos < column_infos.size() && bit != 0) {
1302
0
                if (!column_infos[col_pos].is_null_at(row)) {
1303
                    /* Here insert `str,` directly to support the case below:
1304
                     * SELECT MAKE_SET(3, '', 'a');
1305
                     * the exception result should be ',a'.
1306
                     */
1307
0
                    auto s_ref = str_cols[col_pos]->get_data_at(
1308
0
                            column_infos[col_pos].is_const ? 0 : row);
1309
0
                    data.insert(s_ref.data, s_ref.data + s_ref.size);
1310
0
                    data.push_back(SEPARATOR);
1311
0
                }
1312
0
                bit &= ~(1ULL << (col_pos - 1));
1313
0
                col_pos = __builtin_ffsll(bit);
1314
0
            }
1315
            // remove the last ','
1316
0
            if (!data.empty()) {
1317
0
                data.pop_back();
1318
0
            }
1319
0
            res_col->insert_data(reinterpret_cast<const char*>(data.data()), data.size());
1320
0
        }
1321
0
    }
1322
};
1323
1324
class FunctionExportSet : public IFunction {
1325
public:
1326
    static constexpr auto name = "export_set";
1327
2
    static FunctionPtr create() { return std::make_shared<FunctionExportSet>(); }
1328
0
    String get_name() const override { return name; }
1329
0
    size_t get_number_of_arguments() const override { return 0; }
1330
1
    bool is_variadic() const override { return true; }
1331
0
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
1332
0
        return std::make_shared<DataTypeString>();
1333
0
    }
1334
1335
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
1336
0
                        uint32_t result, size_t input_rows_count) const override {
1337
0
        auto res_col = ColumnString::create();
1338
1339
0
        const size_t arg_size = arguments.size();
1340
0
        bool col_const[5];
1341
0
        ColumnPtr arg_cols[5];
1342
0
        bool all_const = true;
1343
0
        for (int i = 1; i < arg_size; ++i) {
1344
0
            col_const[i] = is_column_const(*block.get_by_position(arguments[i]).column);
1345
0
            all_const = all_const && col_const[i];
1346
0
        }
1347
0
        std::tie(arg_cols[0], col_const[0]) =
1348
0
                unpack_if_const(block.get_by_position(arguments[0]).column);
1349
0
        if (arg_size == 3) {
1350
0
            default_preprocess_parameter_columns(arg_cols, col_const, {1, 2}, block, arguments);
1351
0
        } else if (arg_size == 4) {
1352
0
            default_preprocess_parameter_columns(arg_cols, col_const, {1, 2, 3}, block, arguments);
1353
0
        } else if (arg_size == 5) {
1354
0
            default_preprocess_parameter_columns(arg_cols, col_const, {1, 2, 3, 4}, block,
1355
0
                                                 arguments);
1356
0
        }
1357
1358
0
        const auto* bit_col = assert_cast<const ColumnInt128*>(arg_cols[0].get());
1359
0
        const auto* on_col = assert_cast<const ColumnString*>(arg_cols[1].get());
1360
0
        const auto* off_col = assert_cast<const ColumnString*>(arg_cols[2].get());
1361
0
        const ColumnString* sep_col = nullptr;
1362
0
        const ColumnInt32* num_bits_col = nullptr;
1363
0
        if (arg_size > 3) {
1364
0
            sep_col = assert_cast<const ColumnString*>(arg_cols[3].get());
1365
0
            if (arg_size == 5) {
1366
0
                num_bits_col = assert_cast<const ColumnInt32*>(arg_cols[4].get());
1367
0
            }
1368
0
        }
1369
1370
0
        for (size_t i = 0; i < input_rows_count; ++i) {
1371
0
            uint64_t bit =
1372
0
                    check_and_get_bit(bit_col->get_element(index_check_const(i, col_const[0])));
1373
1374
0
            size_t idx_for_args = all_const ? 0 : i;
1375
0
            StringRef on = on_col->get_data_at(idx_for_args);
1376
0
            StringRef off = off_col->get_data_at(idx_for_args);
1377
0
            StringRef separator(",", 1);
1378
0
            int8_t num_of_bits = 64;
1379
1380
0
            if (arg_size > 3) {
1381
0
                separator = sep_col->get_data_at(idx_for_args);
1382
0
                if (arg_size == 5) {
1383
0
                    num_of_bits =
1384
0
                            check_and_get_num_of_bits(num_bits_col->get_element(idx_for_args));
1385
0
                }
1386
0
            }
1387
1388
0
            execute_single(bit, on, off, separator, num_of_bits, *res_col);
1389
0
        }
1390
0
        block.replace_by_position(result, std::move(res_col));
1391
0
        return Status::OK();
1392
0
    }
1393
1394
private:
1395
    /* The valid range of the input `bit` parameter should be [-2^63, 2^64 - 1]
1396
     * If it exceeds this range, the MAX/MIN values of the signed 64-bit integer are used for calculation
1397
     * This behavior is consistent with MySQL.
1398
     */
1399
0
    uint64_t check_and_get_bit(__int128 col_bit_val) const {
1400
0
        if (col_bit_val > ULLONG_MAX) {
1401
0
            return LLONG_MAX;
1402
0
        } else if (col_bit_val < LLONG_MIN) {
1403
0
            return LLONG_MIN;
1404
0
        }
1405
0
        return static_cast<uint64_t>(col_bit_val);
1406
0
    }
1407
1408
    // If the input value is not in the range [0, 64], return default value 64
1409
0
    int8_t check_and_get_num_of_bits(int32_t col_num_of_bits_val) const {
1410
0
        if (col_num_of_bits_val >= 0 && col_num_of_bits_val <= 64) {
1411
0
            return static_cast<int8_t>(col_num_of_bits_val);
1412
0
        }
1413
0
        return 64;
1414
0
    }
1415
1416
    void execute_single(uint64_t bit, const StringRef& on, const StringRef& off,
1417
                        const StringRef& separator, int8_t num_of_bits,
1418
0
                        ColumnString& res_col) const {
1419
0
        ColumnString::Chars data;
1420
0
        data.reserve(std::max(on.size, off.size) * num_of_bits +
1421
0
                     separator.size * (num_of_bits - 1));
1422
1423
0
        while (bit && num_of_bits) {
1424
0
            if (bit & 1) {
1425
0
                data.insert(on.data, on.data + on.size);
1426
0
            } else {
1427
0
                data.insert(off.data, off.data + off.size);
1428
0
            }
1429
0
            bit >>= 1;
1430
0
            if (--num_of_bits) {
1431
0
                data.insert(separator.data, separator.data + separator.size);
1432
0
            }
1433
0
        }
1434
1435
0
        if (num_of_bits > 0) {
1436
0
            ColumnString::Chars off_sep_combo;
1437
0
            off_sep_combo.reserve(separator.size + off.size);
1438
0
            off_sep_combo.insert(off_sep_combo.end(), off.data, off.data + off.size);
1439
0
            off_sep_combo.insert(off_sep_combo.end(), separator.data,
1440
0
                                 separator.data + separator.size);
1441
1442
0
            for (size_t i = 0; i < num_of_bits; ++i) {
1443
0
                data.insert(off_sep_combo.data(), off_sep_combo.data() + off_sep_combo.size());
1444
0
            }
1445
0
            data.erase(data.end() - separator.size, data.end());
1446
0
        }
1447
1448
0
        res_col.insert_data(reinterpret_cast<const char*>(data.data()), data.size());
1449
0
    }
1450
};
1451
1452
// ATTN: for debug only
1453
// compute crc32 hash value as the same way in `VOlapTablePartitionParam::find_tablets()`
1454
class FunctionCrc32Internal : public IFunction {
1455
public:
1456
    static constexpr auto name = "crc32_internal";
1457
2
    static FunctionPtr create() { return std::make_shared<FunctionCrc32Internal>(); }
1458
0
    String get_name() const override { return name; }
1459
0
    size_t get_number_of_arguments() const override { return 0; }
1460
1
    bool is_variadic() const override { return true; }
1461
0
    bool use_default_implementation_for_nulls() const override { return false; }
1462
0
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
1463
0
        return std::make_shared<DataTypeInt64>();
1464
0
    }
1465
1466
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
1467
0
                        uint32_t result, size_t input_rows_count) const override {
1468
0
        DCHECK_GE(arguments.size(), 1);
1469
1470
0
        auto argument_size = arguments.size();
1471
0
        std::vector<ColumnPtr> argument_columns(argument_size);
1472
0
        std::vector<PrimitiveType> argument_primitive_types(argument_size);
1473
1474
0
        for (size_t i = 0; i < argument_size; ++i) {
1475
0
            argument_columns[i] =
1476
0
                    block.get_by_position(arguments[i]).column->convert_to_full_column_if_const();
1477
0
            argument_primitive_types[i] =
1478
0
                    block.get_by_position(arguments[i]).type->get_primitive_type();
1479
0
        }
1480
1481
0
        auto res_col = ColumnInt64::create();
1482
0
        auto& res_data = res_col->get_data();
1483
0
        res_data.resize_fill(input_rows_count, 0);
1484
1485
0
        for (size_t i = 0; i < input_rows_count; ++i) {
1486
0
            uint32_t hash_val = 0;
1487
0
            for (size_t j = 0; j < argument_size; ++j) {
1488
0
                const auto& column = argument_columns[j];
1489
0
                auto primitive_type = argument_primitive_types[j];
1490
0
                auto val = column->get_data_at(i);
1491
0
                if (val.data != nullptr) {
1492
0
                    hash_val = RawValue::zlib_crc32(val.data, val.size, primitive_type, hash_val);
1493
0
                } else {
1494
0
                    hash_val = HashUtil::zlib_crc_hash_null(hash_val);
1495
0
                }
1496
0
            }
1497
0
            res_data[i] = hash_val;
1498
0
        }
1499
1500
0
        block.replace_by_position(result, std::move(res_col));
1501
0
        return Status::OK();
1502
0
    }
1503
};
1504
1505
class FunctionUnicodeNormalize : public IFunction {
1506
public:
1507
    static constexpr auto name = "unicode_normalize";
1508
1509
9
    static FunctionPtr create() { return std::make_shared<FunctionUnicodeNormalize>(); }
1510
1511
3
    String get_name() const override { return name; }
1512
1513
7
    size_t get_number_of_arguments() const override { return 2; }
1514
1515
7
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
1516
7
        if (arguments.size() != 2 || !is_string_type(arguments[0]->get_primitive_type()) ||
1517
7
            !is_string_type(arguments[1]->get_primitive_type())) {
1518
0
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
1519
0
                                   "Illegal type {} and {} of arguments of function {}",
1520
0
                                   arguments[0]->get_name(), arguments[1]->get_name(), get_name());
1521
0
        }
1522
7
        return arguments[0];
1523
7
    }
1524
1525
10
    ColumnNumbers get_arguments_that_are_always_constant() const override { return {1}; }
1526
1527
12
    Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override {
1528
12
        if (scope == FunctionContext::THREAD_LOCAL) {
1529
5
            return Status::OK();
1530
5
        }
1531
1532
7
        if (!context->is_col_constant(1)) {
1533
1
            return Status::InvalidArgument(
1534
1
                    "The second argument 'mode' of function {} must be constant", get_name());
1535
1
        }
1536
1537
6
        auto* const_col = context->get_constant_col(1);
1538
6
        auto mode_ref = const_col->column_ptr->get_data_at(0);
1539
6
        std::string lower_mode = doris::to_lower(std::string(doris::trim(mode_ref.to_string())));
1540
1541
6
        UErrorCode status = U_ZERO_ERROR;
1542
6
        const icu::Normalizer2* normalizer = nullptr;
1543
1544
6
        if (lower_mode == "nfc") {
1545
2
            normalizer = icu::Normalizer2::getInstance(nullptr, "nfc", UNORM2_COMPOSE, status);
1546
4
        } else if (lower_mode == "nfd") {
1547
1
            normalizer = icu::Normalizer2::getNFDInstance(status);
1548
3
        } else if (lower_mode == "nfkc") {
1549
0
            normalizer = icu::Normalizer2::getInstance(nullptr, "nfkc", UNORM2_COMPOSE, status);
1550
3
        } else if (lower_mode == "nfkd") {
1551
1
            normalizer = icu::Normalizer2::getNFKDInstance(status);
1552
2
        } else if (lower_mode == "nfkc_cf") {
1553
1
            normalizer = icu::Normalizer2::getInstance(nullptr, "nfkc_cf", UNORM2_COMPOSE, status);
1554
1
        } else {
1555
1
            return Status::InvalidArgument(
1556
1
                    "Invalid normalization mode '{}' for function {}. "
1557
1
                    "Supported modes: NFC, NFD, NFKC, NFKD, NFKC_CF",
1558
1
                    lower_mode, get_name());
1559
1
        }
1560
1561
5
        if (U_FAILURE(status) || normalizer == nullptr) {
1562
0
            return Status::InvalidArgument(
1563
0
                    "Failed to get normalizer instance for mode '{}' in function {}: {}",
1564
0
                    lower_mode, get_name(), u_errorName(status));
1565
0
        }
1566
1567
5
        auto state = std::make_shared<UnicodeNormalizeState>();
1568
5
        state->normalizer = normalizer;
1569
5
        context->set_function_state(scope, state);
1570
5
        return Status::OK();
1571
5
    }
1572
1573
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
1574
5
                        uint32_t result, size_t input_rows_count) const override {
1575
5
        auto* state = reinterpret_cast<UnicodeNormalizeState*>(
1576
5
                context->get_function_state(FunctionContext::FRAGMENT_LOCAL));
1577
5
        if (state == nullptr || state->normalizer == nullptr) {
1578
0
            return Status::RuntimeError("unicode_normalize function state is not initialized");
1579
0
        }
1580
1581
5
        ColumnPtr col =
1582
5
                block.get_by_position(arguments[0]).column->convert_to_full_column_if_const();
1583
5
        const auto* col_str = check_and_get_column<ColumnString>(col.get());
1584
5
        if (col_str == nullptr) {
1585
0
            return Status::RuntimeError("Illegal column {} of argument of function {}",
1586
0
                                        block.get_by_position(arguments[0]).column->get_name(),
1587
0
                                        get_name());
1588
0
        }
1589
1590
5
        const auto& data = col_str->get_chars();
1591
5
        const auto& offsets = col_str->get_offsets();
1592
1593
5
        auto res = ColumnString::create();
1594
5
        auto& res_data = res->get_chars();
1595
5
        auto& res_offsets = res->get_offsets();
1596
1597
5
        size_t rows = offsets.size();
1598
5
        res_offsets.resize(rows);
1599
1600
5
        std::string tmp;
1601
10
        for (size_t i = 0; i < rows; ++i) {
1602
5
            const char* begin = reinterpret_cast<const char*>(&data[offsets[i - 1]]);
1603
5
            size_t len = offsets[i] - offsets[i - 1];
1604
1605
5
            normalize_one(state->normalizer, begin, len, tmp);
1606
5
            StringOP::push_value_string(tmp, i, res_data, res_offsets);
1607
5
        }
1608
1609
5
        block.replace_by_position(result, std::move(res));
1610
5
        return Status::OK();
1611
5
    }
1612
1613
private:
1614
    struct UnicodeNormalizeState {
1615
        const icu::Normalizer2* normalizer = nullptr;
1616
    };
1617
1618
    static void normalize_one(const icu::Normalizer2* normalizer, const char* input, size_t length,
1619
5
                              std::string& output) {
1620
5
        if (length == 0) {
1621
0
            output.clear();
1622
0
            return;
1623
0
        }
1624
1625
5
        icu::StringPiece sp(input, static_cast<int32_t>(length));
1626
5
        icu::UnicodeString src16 = icu::UnicodeString::fromUTF8(sp);
1627
1628
5
        UErrorCode status = U_ZERO_ERROR;
1629
5
        UNormalizationCheckResult quick = normalizer->quickCheck(src16, status);
1630
5
        if (U_SUCCESS(status) && quick == UNORM_YES) {
1631
2
            output.assign(input, length);
1632
2
            return;
1633
2
        }
1634
1635
3
        icu::UnicodeString result16;
1636
3
        status = U_ZERO_ERROR;
1637
3
        normalizer->normalize(src16, result16, status);
1638
3
        if (U_FAILURE(status)) {
1639
0
            output.assign(input, length);
1640
0
            return;
1641
0
        }
1642
1643
3
        output.clear();
1644
3
        result16.toUTF8String(output);
1645
3
    }
1646
};
1647
1648
using FunctionMakeSet = FunctionNeedsToHandleNull<MakeSetImpl, PrimitiveType::TYPE_STRING>;
1649
1650
1
// Registers every miscellaneous string function referenced below with `factory`,
// one register_function<> call per function class.
void register_function_string_misc(SimpleFunctionFactory& factory) {
1651
1
    factory.register_function<FunctionAutoPartitionName>();
1652
1
    factory.register_function<FunctionConvertTo>();
1653
1
    factory.register_function<FunctionIntToChar>();
1654
1
    factory.register_function<FunctionRandomBytes>();
1655
1
    factory.register_function<FunctionTranslate>();
1656
1
    factory.register_function<FunctionNgramSearch>();
1657
1
    factory.register_function<FunctionXPathString>();
1658
1
    factory.register_function<FunctionCrc32Internal>();
1659
1
    factory.register_function<FunctionMakeSet>();
1660
1
    factory.register_function<FunctionExportSet>();
1661
1
    factory.register_function<FunctionUnicodeNormalize>();
1662
1
}
1663
1664
#include "common/compile_check_avoid_end.h"
1665
} // namespace doris