Coverage Report

Created: 2026-04-11 14:25

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exprs/function/function_string_misc.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include <crc32c/crc32c.h>
19
#include <fmt/format.h>
20
#include <glog/logging.h>
21
#include <unicode/normalizer2.h>
22
#include <unicode/stringpiece.h>
23
#include <unicode/unistr.h>
24
25
#include <algorithm>
26
#include <bit>
27
#include <boost/locale.hpp>
28
#include <climits>
29
#include <cstddef>
30
#include <cstdint>
31
#include <cstdlib>
32
#include <cstring>
33
#include <format>
34
#include <iomanip>
35
#include <memory>
36
#include <random>
37
#include <sstream>
38
#include <string>
39
#include <string_view>
40
#include <unordered_map>
41
#include <utility>
42
#include <vector>
43
44
#include "common/compiler_util.h"
45
#include "common/exception.h"
46
#include "common/status.h"
47
#include "core/assert_cast.h"
48
#include "core/block/block.h"
49
#include "core/block/column_numbers.h"
50
#include "core/block/column_with_type_and_name.h"
51
#include "core/column/column.h"
52
#include "core/column/column_const.h"
53
#include "core/column/column_nullable.h"
54
#include "core/column/column_string.h"
55
#include "core/column/column_vector.h"
56
#include "core/data_type/data_type.h"
57
#include "core/data_type/data_type_nullable.h"
58
#include "core/data_type/data_type_number.h"
59
#include "core/data_type/data_type_string.h"
60
#include "core/data_type/define_primitive_type.h"
61
#include "core/memcpy_small.h"
62
#include "core/pod_array.h"
63
#include "core/string_ref.h"
64
#include "core/types.h"
65
#include "exec/common/hash_table/phmap_fwd_decl.h"
66
#include "exec/common/pinyin.h"
67
#include "exec/common/stringop_substring.h"
68
#include "exec/common/template_helpers.hpp"
69
#include "exprs/function/function.h"
70
#include "exprs/function/function_helpers.h"
71
#include "exprs/function/function_needs_to_handle_null.h"
72
#include "exprs/function_context.h"
73
#include "pugixml.hpp"
74
#include "util/hash_util.hpp"
75
#include "util/raw_value.h"
76
#include "util/simd/vstring_function.h"
77
#include "util/string_util.h"
78
#include "util/utf8_check.h"
79
80
#ifndef USE_LIBCPP
81
#include <memory_resource>
82
#define PMR std::pmr
83
#else
84
#include <boost/container/pmr/monotonic_buffer_resource.hpp>
85
#include <boost/container/pmr/vector.hpp>
86
#define PMR boost::container::pmr
87
#endif
88
89
#include "exprs/function/simple_function_factory.h"
90
91
namespace doris {
92
#include "common/compile_check_avoid_begin.h"
93
94
/// SQL function `auto_partition_name`: builds a partition name from its string arguments.
///
/// Argument layout, as consumed by execute_impl():
///   - list:  ("list", value_1, ..., value_n)  -> 'p' + encoded values (see below)
///   - range: (<any other type>, unit, "yyyy-mm-dd[ hh:mm:ss]") -> 'p' + 14 digits
///
/// All arguments are read as string columns (optionally nullable). NULL handling is done
/// here (use_default_implementation_for_nulls() is false): a NULL list value is encoded as 'X'.
class FunctionAutoPartitionName : public IFunction {
public:
    static constexpr auto name = "auto_partition_name";
    static FunctionPtr create() { return std::make_shared<FunctionAutoPartitionName>(); }
    String get_name() const override { return name; }
    size_t get_number_of_arguments() const override { return 0; }
    bool is_variadic() const override { return true; }
    bool use_default_implementation_for_nulls() const override { return false; }
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
        return std::make_shared<DataTypeString>();
    }

    /// Unpacks every argument into (chars, offsets, null map, is_const) and dispatches on the
    /// partition type stored in the first row of argument 0.
    ///
    /// Returns InvalidArgument when no argument is supplied, or when the selected partition
    /// type rejects its input (see the private helpers).
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
                        uint32_t result, size_t input_rows_count) const override {
        const size_t argument_size = arguments.size();
        if (argument_size == 0) {
            // Argument 0 (the partition type) is dereferenced unconditionally below.
            return Status::InvalidArgument("function {} requires at least one argument", name);
        }

        // Shared all-zero null map used for arguments that are not nullable.
        auto const_null_map = ColumnUInt8::create(input_rows_count, 0);
        std::vector<const ColumnString::Chars*> chars_list(argument_size);
        std::vector<const ColumnString::Offsets*> offsets_list(argument_size);
        std::vector<bool> is_const_args(argument_size);
        std::vector<const ColumnUInt8::Container*> null_list(argument_size);
        // Keeps the null-map columns referenced by null_list alive after argument_columns[i]
        // is re-pointed at the nested column.
        std::vector<ColumnPtr> argument_null_columns(argument_size);
        std::vector<ColumnPtr> argument_columns(argument_size);

        for (size_t i = 0; i < argument_size; ++i) {
            argument_columns[i] =
                    block.get_by_position(arguments[i]).column->convert_to_full_column_if_const();
            if (const auto* nullable =
                        check_and_get_column<const ColumnNullable>(*argument_columns[i])) {
                null_list[i] = &nullable->get_null_map_data();
                argument_null_columns[i] = nullable->get_null_map_column_ptr();
                argument_columns[i] = nullable->get_nested_column_ptr();
            } else {
                null_list[i] = &const_null_map->get_data();
            }

            // Const-ness is taken from the original (unexpanded) column.
            const auto& [col, is_const] =
                    unpack_if_const(block.get_by_position(arguments[i]).column);

            const auto* col_str = assert_cast<const ColumnString*>(argument_columns[i].get());
            chars_list[i] = &col_str->get_chars();
            offsets_list[i] = &col_str->get_offsets();
            is_const_args[i] = is_const;
        }

        auto res = ColumnString::create();
        auto& res_data = res->get_chars();
        auto& res_offset = res->get_offsets();
        res_offset.resize(input_rows_count);

        // partition type is list|range; only the first row's leading bytes are inspected.
        const char* partition_type = chars_list[0]->raw_data();
        if (std::strncmp(partition_type, "list", 4) == 0) {
            return _auto_partition_type_of_list(chars_list, offsets_list, is_const_args, null_list,
                                                res_data, res_offset, input_rows_count,
                                                argument_size, block, result, res);
        }
        return _auto_partition_type_of_range(chars_list, offsets_list, is_const_args, res_data,
                                             res_offset, input_rows_count, argument_size, block,
                                             result, res);
    }

private:
    /// Converts a UTF-8 string to UTF-16 code units.
    std::u16string _string_to_u16string(const std::string& str) const {
        return boost::locale::conv::utf_to_utf<char16_t>(str);
    }

    /// Encodes a UTF-16 string for use inside a partition name.
    ///
    /// ASCII letters and digits are copied as-is; every other code unit is replaced by the
    /// lower-case hex value of the code point found at that position (at least two digits).
    /// A leading '-' additionally gets a '_' prefix. The loop advances one code unit at a time,
    /// so the low half of a surrogate pair is visited (and encoded) again on its own.
    std::string _string_to_unicode(const std::u16string& s) const {
        std::string res_s;
        res_s.reserve(s.size());
        if (!s.empty() && s[0] == '-') {
            res_s += '_';
        }
        for (size_t i = 0; i < s.length(); i++) {
            const char16_t ch = s[i];
            const bool is_alnum = (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') ||
                                  (ch >= '0' && ch <= '9');
            if (is_alnum) {
                res_s += static_cast<char>(ch);
            } else {
                const int unicode_value = _get_code_point_at(s, i);
                res_s += fmt::format("{:02x}", static_cast<uint32_t>(unicode_value));
            }
        }
        return res_s;
    }

    /// Returns the code point starting at `index`: a combined value when str[index] and
    /// str[index + 1] form a high/low surrogate pair, otherwise the code unit itself.
    int _get_code_point_at(const std::u16string& str, std::size_t index) const {
        const char16_t first = str[index];
        // [0xD800,0xDBFF] is the scope of the first code unit
        if ((first >= 0xD800 && first <= 0xDBFF) && (index + 1 < str.size())) {
            const char16_t second = str[index + 1];
            // [0xDC00,0xDFFF] is the scope of the second code unit
            if (second >= 0xDC00 && second <= 0xDFFF) {
                return ((first - 0xD800) << 10) + (second - 0xDC00) + 0x10000;
            }
        }

        return first;
    }

    /// Builds list-partition names: 'p' followed, for each value argument (index >= 1), by
    /// either 'X' (NULL) or the encoded value plus its UTF-16 length. Names longer than 50
    /// characters are cut to 50 and suffixed with '_' and a hash of the full name.
    /// Stores the result column into `block` at position `result`.
    Status _auto_partition_type_of_list(std::vector<const ColumnString::Chars*>& chars_list,
                                        std::vector<const ColumnString::Offsets*>& offsets_list,
                                        std::vector<bool>& is_const_args,
                                        const std::vector<const ColumnUInt8::Container*>& null_list,
                                        auto& res_data, auto& res_offset, size_t input_rows_count,
                                        size_t argument_size, Block& block, uint32_t result,
                                        auto& res) const {
        size_t curr_len = 0;
        for (size_t row = 0; row < input_rows_count; row++) {
            std::string res_p;
            res_p.reserve(argument_size * 5);
            res_p += 'p';
            for (size_t col = 1; col < argument_size; col++) {
                const auto& current_offsets = *offsets_list[col];
                const auto& current_chars = *chars_list[col];
                const auto& current_nullmap = *null_list[col];

                if (current_nullmap[row]) {
                    res_p += 'X';
                    continue;
                }

                const auto idx = index_check_const(row, is_const_args[col]);
                const size_t size = current_offsets[idx] - current_offsets[idx - 1];
                const char* raw_chars =
                        reinterpret_cast<const char*>(&current_chars[current_offsets[idx - 1]]);
                // convert string to u16string in order to convert to unicode strings
                const std::string raw_str(raw_chars, size);
                const auto u16string = _string_to_u16string(raw_str);
                res_p += _string_to_unicode(u16string) + std::to_string(u16string.size());
            }

            // check the length of the name
            if (res_p.size() > 50) {
                // NOTE(review): to_hash_code() returns a signed value; confirm the formatting of
                // negative hashes matches whatever other component generates these names.
                res_p = std::format("{}_{:08x}", res_p.substr(0, 50), to_hash_code(res_p));
            }
            const size_t len = res_p.size();
            curr_len += len;
            res_data.resize(curr_len);
            memcpy(&res_data[res_offset[row - 1]], res_p.c_str(), len);
            res_offset[row] = res_offset[row - 1] + len;
        }
        block.get_by_position(result).column = std::move(res);
        return Status::OK();
    }

    /// Copies the first `len` entries of `date_str` back to back into the output slot of `row`,
    /// starting one byte after the slot start (byte 0 is reserved for the 'p' prefix).
    /// Returns the slot length used so far, including that prefix byte.
    size_t _copy_date_str_of_len_to_res_data(auto& res_data, auto& res_offset,
                                             std::vector<std::string>& date_str, size_t row,
                                             size_t len) const {
        size_t curr_len = 1;
        for (size_t j = 0; j < len; j++) {
            memcpy(&res_data[res_offset[row - 1]] + curr_len, date_str[j].c_str(),
                   date_str[j].size());
            curr_len += date_str[j].size();
        }
        return curr_len;
    }

    /// Builds range-partition names: 'p' followed by the date truncated to the unit given in
    /// argument 1 (first row only) and right-padded with '0' to 15 characters in total.
    ///
    /// Returns InvalidArgument when fewer than 3 arguments are supplied or when a value of
    /// argument 2 is not formatted as `yyyy-mm-dd` or `yyyy-mm-dd hh:mm:ss`.
    /// On success stores the result column into `block` at position `result`.
    Status _auto_partition_type_of_range(std::vector<const ColumnString::Chars*>& chars_list,
                                         std::vector<const ColumnString::Offsets*>& offsets_list,
                                         std::vector<bool>& is_const_args, auto& res_data,
                                         auto& res_offset, size_t input_rows_count,
                                         size_t argument_size, Block& block, uint32_t result,
                                         auto& res) const {
        if (argument_size < 3) {
            // Arguments 1 (unit) and 2 (date string) are dereferenced below.
            return Status::InvalidArgument(
                    "function {} requires 3 arguments for range partition, but got {}", name,
                    argument_size);
        }

        constexpr size_t range_name_len = 15; // 'p' + yyyymmddhhmmss
        const char* range_type = chars_list[1]->raw_data();
        // Compiled once per call instead of once per row.
        const RE2 date_regex(R"(^\d{4}-\d{2}-\d{2}( \d{2}:\d{2}:\d{2})?$)");
        const auto& current_offsets = *offsets_list[2];
        const auto& current_chars = *chars_list[2];

        res_data.resize(range_name_len * input_rows_count);
        for (size_t i = 0; i < input_rows_count; i++) {
            const auto idx = index_check_const(i, is_const_args[2]);
            const size_t size = current_offsets[idx] - current_offsets[idx - 1];
            const char* tmp =
                    reinterpret_cast<const char*>(&current_chars[current_offsets[idx - 1]]);
            const std::string to_split_s(tmp, size);

            // check the str if it is date|datetime
            if (!RE2::FullMatch(to_split_s, date_regex)) {
                return Status::InvalidArgument("The range partition only support DATE|DATETIME");
            }

            // split date_str from (yyyy-mm-dd hh:mm:ss) to ([yyyy, mm, dd, hh, mm, ss]);
            // fields absent from a date-only input stay empty.
            std::vector<std::string> date_str(6);
            date_str[0] = to_split_s.substr(0, 4);
            for (size_t ni = 5, j = 1; ni <= size; ni += 3, j++) {
                date_str[j] = to_split_s.substr(ni, 2);
            }
            size_t curr_len = 0;

            res_data[res_offset[i - 1]] = 'p';
            // raw => 2022-12-12 11:30:20
            // year => 2022 01 01 00 00 00
            // month => 2022 12 01 00 00 00
            // day => 2022 12 12 00 00 00
            // hour => 2022 12 12 11 00 00
            // minute => 2022 12 12 11 30 00
            // second => 2022 12 12 11 30 20

            if (!strncmp(range_type, "year", 4)) {
                curr_len += _copy_date_str_of_len_to_res_data(res_data, res_offset, date_str, i, 1);
                memcpy(&res_data[res_offset[i - 1]] + curr_len, "0101", 4);
                curr_len += 4;
            } else if (!strncmp(range_type, "month", 5)) {
                curr_len += _copy_date_str_of_len_to_res_data(res_data, res_offset, date_str, i, 2);
                memcpy(&res_data[res_offset[i - 1]] + curr_len, "01", 2);
                curr_len += 2;
            } else if (!strncmp(range_type, "day", 3)) {
                curr_len += _copy_date_str_of_len_to_res_data(res_data, res_offset, date_str, i, 3);
            } else if (!strncmp(range_type, "hour", 4)) {
                curr_len += _copy_date_str_of_len_to_res_data(res_data, res_offset, date_str, i, 4);
            } else if (!strncmp(range_type, "minute", 6)) {
                curr_len += _copy_date_str_of_len_to_res_data(res_data, res_offset, date_str, i, 5);
            } else if (!strncmp(range_type, "second", 6)) {
                curr_len += _copy_date_str_of_len_to_res_data(res_data, res_offset, date_str, i, 6);
            }
            // NOTE(review): for any other unit curr_len stays 0, so the padding below also
            // overwrites the 'p' prefix and the name becomes fifteen '0' characters.

            // fill in zero
            const size_t zero = range_name_len - curr_len;
            std::fill_n(&res_data[res_offset[i - 1]] + curr_len, zero, '0');
            curr_len += zero;
            res_offset[i] = res_offset[i - 1] + curr_len;
        }
        block.get_by_position(result).column = std::move(res);
        return Status::OK();
    }

    /// Polynomial hash over the bytes of `str` (h = h * 31 + byte), truncated to 32 bits and
    /// returned as a signed value.
    int32_t to_hash_code(const std::string& str) const {
        uint64_t h = 0;
        for (uint8_t c : str) {
            h = (h * 31U + c) & 0xFFFFFFFFU;
        }
        return static_cast<int32_t>(h);
    }
};
331
332
class FunctionRandomBytes : public IFunction {
333
public:
334
    static constexpr auto name = "random_bytes";
335
14
    static FunctionPtr create() { return std::make_shared<FunctionRandomBytes>(); }
336
1
    String get_name() const override { return name; }
337
5
    size_t get_number_of_arguments() const override { return 1; }
338
6
    bool is_variadic() const override { return false; }
339
340
5
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
341
5
        return std::make_shared<DataTypeString>();
342
5
    }
343
344
15
    bool use_default_implementation_for_constants() const final { return false; }
345
346
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
347
4
                        uint32_t result, size_t input_rows_count) const override {
348
4
        auto res = ColumnString::create();
349
4
        auto& res_offsets = res->get_offsets();
350
4
        auto& res_chars = res->get_chars();
351
4
        res_offsets.resize(input_rows_count);
352
353
4
        auto [arg_col, arg_const] = unpack_if_const(block.get_by_position(arguments[0]).column);
354
4
        const auto* length_col = assert_cast<const ColumnInt32*>(arg_col.get());
355
356
4
        if (arg_const) {
357
3
            res_chars.reserve(input_rows_count * (length_col->get_element(0) + 2));
358
3
        }
359
360
4
        std::vector<uint8_t, Allocator_<uint8_t>> random_bytes;
361
4
        std::random_device rd;
362
4
        std::mt19937 gen(rd());
363
364
4
        std::uniform_int_distribution<unsigned short> distribution(0, 255);
365
19
        for (size_t i = 0; i < input_rows_count; ++i) {
366
16
            size_t index = index_check_const(i, arg_const);
367
16
            if (length_col->get_element(index) < 0) [[unlikely]] {
368
1
                return Status::InvalidArgument("argument {} of function {} at row {} was invalid.",
369
1
                                               length_col->get_element(index), name, index);
370
1
            }
371
15
            random_bytes.resize(length_col->get_element(index));
372
373
117
            for (auto& byte : random_bytes) {
374
117
                byte = distribution(gen) & 0xFF;
375
117
            }
376
377
15
            std::basic_ostringstream<char, std::char_traits<char>, Allocator_<char>> oss;
378
117
            for (const auto& byte : random_bytes) {
379
117
                oss << std::setw(2) << std::setfill('0') << std::hex << static_cast<int>(byte);
380
117
            }
381
382
15
            StringOP::push_value_string("0x" + oss.str(), i, res_chars, res_offsets);
383
15
            random_bytes.clear();
384
15
        }
385
386
3
        block.get_by_position(result).column = std::move(res);
387
388
3
        return Status::OK();
389
4
    }
390
};
391
392
class FunctionConvertTo : public IFunction {
393
public:
394
    static constexpr auto name = "convert_to";
395
396
15
    static FunctionPtr create() { return std::make_shared<FunctionConvertTo>(); }
397
398
1
    String get_name() const override { return name; }
399
400
6
    size_t get_number_of_arguments() const override { return 2; }
401
402
6
    DataTypePtr get_return_type_impl(const DataTypes& /*arguments*/) const override {
403
6
        return std::make_shared<DataTypeString>();
404
6
    }
405
406
29
    Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override {
407
29
        if (scope != FunctionContext::THREAD_LOCAL) {
408
6
            return Status::OK();
409
6
        }
410
23
        if (!context->is_col_constant(1)) {
411
0
            return Status::InvalidArgument(
412
0
                    "character argument to convert function must be constant.");
413
0
        }
414
23
        const auto& character_data = context->get_constant_col(1)->column_ptr->get_data_at(0);
415
23
        if (!iequal(character_data.to_string(), "gbk")) {
416
0
            return Status::RuntimeError(
417
0
                    "Illegal second argument column of function convert. now only support "
418
0
                    "convert to character set of gbk");
419
0
        }
420
421
23
        return Status::OK();
422
23
    }
423
424
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
425
14
                        uint32_t result, size_t input_rows_count) const override {
426
14
        ColumnPtr argument_column =
427
14
                block.get_by_position(arguments[0]).column->convert_to_full_column_if_const();
428
14
        const ColumnString* str_col = static_cast<const ColumnString*>(argument_column.get());
429
14
        const auto& str_offset = str_col->get_offsets();
430
14
        const auto& str_chars = str_col->get_chars();
431
14
        auto col_res = ColumnString::create();
432
14
        auto& res_offset = col_res->get_offsets();
433
14
        auto& res_chars = col_res->get_chars();
434
14
        res_offset.resize(input_rows_count);
435
        // max pinyin size is 6 + 1 (first '~') for utf8 chinese word 3
436
14
        size_t pinyin_size = (str_chars.size() + 2) / 3 * 7;
437
14
        ColumnString::check_chars_length(pinyin_size, 0);
438
14
        res_chars.resize(pinyin_size);
439
440
14
        size_t in_len = 0, out_len = 0;
441
49
        for (int i = 0; i < input_rows_count; ++i) {
442
35
            in_len = str_offset[i] - str_offset[i - 1];
443
35
            const char* in = reinterpret_cast<const char*>(&str_chars[str_offset[i - 1]]);
444
35
            char* out = reinterpret_cast<char*>(&res_chars[res_offset[i - 1]]);
445
35
            _utf8_to_pinyin(in, in_len, out, &out_len);
446
35
            res_offset[i] = res_offset[i - 1] + out_len;
447
35
        }
448
14
        res_chars.resize(res_offset[input_rows_count - 1]);
449
14
        block.replace_by_position(result, std::move(col_res));
450
14
        return Status::OK();
451
14
    }
452
453
35
    void _utf8_to_pinyin(const char* in, size_t in_len, char* out, size_t* out_len) const {
454
225
        auto do_memcpy = [](char*& dest, const char*& from, size_t size) {
455
225
            memcpy_small_allow_read_write_overflow15(dest, from, size);
456
225
            dest += size;
457
225
            from += size;
458
225
        };
459
35
        auto from = in;
460
35
        auto dest = out;
461
462
273
        while (from - in < in_len) {
463
238
            auto length = get_utf8_byte_length(*from);
464
238
            if (length != 3) {
465
225
                do_memcpy(dest, from, length);
466
225
            } else {
467
                // convert utf8 to unicode code to get pinyin offset
468
13
                if (auto tmp = (((int)(*from & 0x0F)) << 12) | (((int)(*(from + 1) & 0x3F)) << 6) |
469
13
                               (*(from + 2) & 0x3F);
470
13
                    tmp >= START_UNICODE_OFFSET and tmp < END_UNICODE_OFFSET) {
471
13
                    const char* buf = nullptr;
472
13
                    if (tmp >= START_UNICODE_OFFSET && tmp < MID_UNICODE_OFFSET) {
473
2
                        buf = PINYIN_DICT1 + (tmp - START_UNICODE_OFFSET) * MAX_PINYIN_LEN;
474
11
                    } else if (tmp >= MID_UNICODE_OFFSET && tmp < END_UNICODE_OFFSET) {
475
11
                        buf = PINYIN_DICT2 + (tmp - MID_UNICODE_OFFSET) * MAX_PINYIN_LEN;
476
11
                    }
477
478
13
                    auto end = strchr(buf, ' ');
479
                    // max len for pinyin is 6
480
13
                    int len = MAX_PINYIN_LEN;
481
13
                    if (end != nullptr && end - buf < MAX_PINYIN_LEN) {
482
3
                        len = end - buf;
483
3
                    }
484
                    // set first char '~' just make sure all english word lower than chinese word
485
13
                    *dest = 126;
486
13
                    memcpy(dest + 1, buf, len);
487
13
                    dest += (len + 1);
488
13
                    from += 3;
489
13
                } else {
490
0
                    do_memcpy(dest, from, 3);
491
0
                }
492
13
            }
493
238
        }
494
495
35
        *out_len = dest - out;
496
35
    }
497
};
498
// +-----------------------------------+
499
// | 丝                                |
500
// +-----------------------------------+
501
// 1 row in set, 1 warning (0.00 sec)
502
// mysql> select char(14989469 using utf8);
503
// +---------------------------+
504
// | char(14989469 using utf8) |
505
// +---------------------------+
506
// | 丝                        |
507
// +---------------------------+
508
// 1 row in set, 1 warning (0.00 sec)
509
// mysql> select char(0xe5, 0xa4, 0x9a, 0xe7, 0x9d, 0xbf, 0xe4, 0xb8, 0x9d, 68, 111, 114, 105, 115 using utf8);
510
// +---------------------------------------------------------------------------------------------+
511
// | char(0xe5, 0xa4, 0x9a, 0xe7, 0x9d, 0xbf, 0xe4, 0xb8, 0x9d, 68, 111, 114, 105, 115 using utf8) |
512
// +---------------------------------------------------------------------------------------------+
513
// | 多睿丝 Doris                                                                                 |
514
// +---------------------------------------------------------------------------------------------+
515
// mysql> select char(68, 111, 114, 0, 105, null, 115 using utf8);
516
// +--------------------------------------------------+
517
// | char(68, 111, 114, 0, 105, null, 115 using utf8) |
518
// +--------------------------------------------------+
519
// | Dor is                                           |
520
// +--------------------------------------------------+
521
522
// return null:
523
// mysql>  select char(255 using utf8);
524
// +----------------------+
525
// | char(255 using utf8) |
526
// +----------------------+
527
// | NULL                 |
528
// +----------------------+
529
// 1 row in set, 2 warnings (0.00 sec)
530
//
531
// mysql> show warnings;
532
// +---------+------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
533
// | Level   | Code | Message                                                                                                                                                                     |
534
// +---------+------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
535
// | Warning | 3719 | 'utf8' is currently an alias for the character set UTF8MB3, but will be an alias for UTF8MB4 in a future release. Please consider using UTF8MB4 in order to be unambiguous. |
536
// | Warning | 1300 | Invalid utf8mb3 character string: 'FF'                                                                                                                                      |
537
// +---------+------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
538
// 2 rows in set (0.01 sec)
539
540
// max int value:
541
// mysql> select char(18446744073709551615);
542
// +--------------------------------------------------------+
543
// | char(18446744073709551615)                             |
544
// +--------------------------------------------------------+
545
// | 0xFFFFFFFF                                             |
546
// +--------------------------------------------------------+
547
// 1 row in set (0.00 sec)
548
//
549
// mysql> select char(18446744073709551616);
550
// +--------------------------------------------------------+
551
// | char(18446744073709551616)                             |
552
// +--------------------------------------------------------+
553
// | 0xFFFFFFFF                                             |
554
// +--------------------------------------------------------+
555
// 1 row in set, 1 warning (0.00 sec)
556
//
557
// mysql> show warnings;
558
// +---------+------+-----------------------------------------------------------+
559
// | Level   | Code | Message                                                   |
560
// +---------+------+-----------------------------------------------------------+
561
// | Warning | 1292 | Truncated incorrect DECIMAL value: '18446744073709551616' |
562
// +---------+------+-----------------------------------------------------------+
563
// 1 row in set (0.00 sec)
564
565
// table columns:
566
// mysql> select * from t;
567
// +------+------+------+
568
// | f1   | f2   | f3   |
569
// +------+------+------+
570
// |  228 |  184 |  157 |
571
// |  228 |  184 |    0 |
572
// |  228 |  184 |   99 |
573
// |   99 |  228 |  184 |
574
// +------+------+------+
575
// 4 rows in set (0.00 sec)
576
//
577
// mysql> select char(f1, f2, f3 using utf8) from t;
578
// +-----------------------------+
579
// | char(f1, f2, f3 using utf8) |
580
// +-----------------------------+
581
// | 丝                          |
582
// |                             |
583
// |                             |
584
// | c                           |
585
// +-----------------------------+
586
// 4 rows in set, 4 warnings (0.00 sec)
587
//
588
// mysql> show warnings;
589
// +---------+------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
590
// | Level   | Code | Message                                                                                                                                                                     |
591
// +---------+------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
592
// | Warning | 3719 | 'utf8' is currently an alias for the character set UTF8MB3, but will be an alias for UTF8MB4 in a future release. Please consider using UTF8MB4 in order to be unambiguous. |
593
// | Warning | 1300 | Invalid utf8mb3 character string: 'E4B800'                                                                                                                                  |
594
// | Warning | 1300 | Invalid utf8mb3 character string: 'E4B863'                                                                                                                                  |
595
// | Warning | 1300 | Invalid utf8mb3 character string: 'E4B8'                                                                                                                                    |
596
// +---------+------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
597
class FunctionIntToChar : public IFunction {
598
public:
599
    static constexpr auto name = "char";
600
320
    static FunctionPtr create() { return std::make_shared<FunctionIntToChar>(); }
601
0
    String get_name() const override { return name; }
602
0
    size_t get_number_of_arguments() const override { return 0; }
603
312
    bool is_variadic() const override { return true; }
604
605
311
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
606
311
        return make_nullable(std::make_shared<DataTypeString>());
607
311
    }
608
622
    bool use_default_implementation_for_nulls() const override { return false; }
609
610
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
611
311
                        uint32_t result, size_t input_rows_count) const override {
612
311
        DCHECK_GE(arguments.size(), 2);
613
614
311
        int argument_size = arguments.size();
615
311
        std::vector<ColumnPtr> str_columns(argument_size - 1);
616
311
        std::vector<const ColumnString::Offsets*> offsets_list(argument_size - 1);
617
311
        std::vector<const ColumnString::Chars*> chars_list(argument_size - 1);
618
619
        // convert each argument columns to column string and then concat the string columns
620
701
        for (size_t i = 1; i < argument_size; ++i) {
621
390
            if (auto const_column = check_and_get_column<const ColumnConst>(
622
390
                        *block.get_by_position(arguments[i]).column)) {
623
                // ignore null
624
4
                if (const_column->only_null()) {
625
0
                    str_columns[i - 1] = nullptr;
626
4
                } else {
627
4
                    auto str_column = ColumnString::create();
628
4
                    auto& chars = str_column->get_chars();
629
4
                    auto& offsets = str_column->get_offsets();
630
4
                    offsets.resize(1);
631
4
                    const ColumnInt32* int_column;
632
4
                    if (auto* nullable = check_and_get_column<const ColumnNullable>(
633
4
                                const_column->get_data_column())) {
634
0
                        int_column = assert_cast<const ColumnInt32*>(
635
0
                                nullable->get_nested_column_ptr().get());
636
4
                    } else {
637
4
                        int_column =
638
4
                                assert_cast<const ColumnInt32*>(&const_column->get_data_column());
639
4
                    }
640
4
                    int int_val = int_column->get_int(0);
641
4
                    integer_to_char_(0, &int_val, chars, offsets);
642
4
                    str_columns[i - 1] =
643
4
                            ColumnConst::create(std::move(str_column), input_rows_count);
644
4
                }
645
4
                offsets_list[i - 1] = nullptr;
646
4
                chars_list[i - 1] = nullptr;
647
386
            } else {
648
386
                auto str_column = ColumnString::create();
649
386
                auto& chars = str_column->get_chars();
650
386
                auto& offsets = str_column->get_offsets();
651
                // data.resize(input_rows_count);
652
386
                offsets.resize(input_rows_count);
653
654
386
                if (auto nullable = check_and_get_column<const ColumnNullable>(
655
386
                            *block.get_by_position(arguments[i]).column)) {
656
23
                    const auto* int_data =
657
23
                            assert_cast<const ColumnInt32*>(nullable->get_nested_column_ptr().get())
658
23
                                    ->get_data()
659
23
                                    .data();
660
23
                    const auto* null_map_data = nullable->get_null_map_data().data();
661
148
                    for (size_t j = 0; j < input_rows_count; ++j) {
662
                        // ignore null
663
125
                        if (null_map_data[j]) {
664
23
                            offsets[j] = offsets[j - 1];
665
102
                        } else {
666
102
                            integer_to_char_(j, int_data + j, chars, offsets);
667
102
                        }
668
125
                    }
669
363
                } else {
670
363
                    const auto* int_data = assert_cast<const ColumnInt32*>(
671
363
                                                   block.get_by_position(arguments[i]).column.get())
672
363
                                                   ->get_data()
673
363
                                                   .data();
674
770
                    for (size_t j = 0; j < input_rows_count; ++j) {
675
407
                        integer_to_char_(j, int_data + j, chars, offsets);
676
407
                    }
677
363
                }
678
386
                offsets_list[i - 1] = &str_column->get_offsets();
679
386
                chars_list[i - 1] = &str_column->get_chars();
680
386
                str_columns[i - 1] = std::move(str_column);
681
386
            }
682
390
        }
683
684
311
        auto null_map = ColumnUInt8::create(input_rows_count, 0);
685
311
        auto res = ColumnString::create();
686
311
        auto& res_data = res->get_chars();
687
311
        auto& res_offset = res->get_offsets();
688
689
311
        size_t res_reserve_size = 0;
690
701
        for (size_t i = 0; i < argument_size - 1; ++i) {
691
390
            if (!str_columns[i]) {
692
0
                continue;
693
0
            }
694
390
            if (auto const_column = check_and_get_column<const ColumnConst>(*str_columns[i])) {
695
4
                auto str_column =
696
4
                        assert_cast<const ColumnString*>(&(const_column->get_data_column()));
697
4
                auto& offsets = str_column->get_offsets();
698
4
                res_reserve_size += (offsets[0] - offsets[-1]) * input_rows_count;
699
386
            } else {
700
918
                for (size_t j = 0; j < input_rows_count; ++j) {
701
532
                    size_t append = (*offsets_list[i])[j] - (*offsets_list[i])[j - 1];
702
                    // check whether the output might overflow(unlikely)
703
532
                    if (UNLIKELY(UINT_MAX - append < res_reserve_size)) {
704
0
                        return Status::BufferAllocFailed(
705
0
                                "function char output is too large to allocate");
706
0
                    }
707
532
                    res_reserve_size += append;
708
532
                }
709
386
            }
710
390
        }
711
311
        if ((UNLIKELY(UINT_MAX - input_rows_count < res_reserve_size))) {
712
0
            return Status::BufferAllocFailed("function char output is too large to allocate");
713
0
        }
714
311
        ColumnString::check_chars_length(res_reserve_size, 0);
715
311
        res_data.resize(res_reserve_size);
716
311
        res_offset.resize(input_rows_count);
717
718
666
        for (size_t i = 0; i < input_rows_count; ++i) {
719
355
            int current_length = 0;
720
915
            for (size_t j = 0; j < argument_size - 1; ++j) {
721
560
                if (!str_columns[j]) {
722
0
                    continue;
723
0
                }
724
560
                if (auto const_column = check_and_get_column<const ColumnConst>(*str_columns[j])) {
725
28
                    auto str_column = assert_cast<const ColumnString*, TypeCheckOnRelease::DISABLE>(
726
28
                            &(const_column->get_data_column()));
727
28
                    auto data_item = str_column->get_data_at(0);
728
28
                    memcpy_small_allow_read_write_overflow15(
729
28
                            &res_data[res_offset[i - 1]] + current_length, data_item.data,
730
28
                            data_item.size);
731
28
                    current_length += data_item.size;
732
532
                } else {
733
532
                    auto& current_offsets = *offsets_list[j];
734
532
                    auto& current_chars = *chars_list[j];
735
736
532
                    int size = current_offsets[i] - current_offsets[i - 1];
737
532
                    if (size > 0) {
738
509
                        memcpy_small_allow_read_write_overflow15(
739
509
                                &res_data[res_offset[i - 1]] + current_length,
740
509
                                &current_chars[current_offsets[i - 1]], size);
741
509
                        current_length += size;
742
509
                    }
743
532
                }
744
560
            }
745
355
            res_offset[i] = res_offset[i - 1] + current_length;
746
355
        }
747
748
        // validate utf8
749
311
        auto* null_map_data = null_map->get_data().data();
750
666
        for (size_t i = 0; i < input_rows_count; ++i) {
751
355
            if (!validate_utf8((const char*)(&res_data[res_offset[i - 1]]),
752
355
                               res_offset[i] - res_offset[i - 1])) {
753
136
                null_map_data[i] = 1;
754
136
            }
755
355
        }
756
757
311
        block.get_by_position(result).column =
758
311
                ColumnNullable::create(std::move(res), std::move(null_map));
759
311
        return Status::OK();
760
311
    }
761
762
private:
763
    void integer_to_char_(int line_num, const int* num, ColumnString::Chars& chars,
764
513
                          IColumn::Offsets& offsets) const {
765
513
        if (0 == *num) {
766
26
            chars.push_back('\0');
767
26
            offsets[line_num] = offsets[line_num - 1] + 1;
768
26
            return;
769
26
        }
770
487
        const char* bytes = (const char*)(num);
771
487
        if constexpr (std::endian::native == std::endian::little) {
772
487
            int k = 3;
773
1.87k
            for (; k >= 0; --k) {
774
1.87k
                if (bytes[k]) {
775
487
                    break;
776
487
                }
777
1.87k
            }
778
487
            offsets[line_num] = offsets[line_num - 1] + k + 1;
779
1.05k
            for (; k >= 0; --k) {
780
565
                chars.push_back(bytes[k] ? bytes[k] : '\0');
781
565
            }
782
        } else if constexpr (std::endian::native == std::endian::big) {
783
            int k = 0;
784
            for (; k < 4; ++k) {
785
                if (bytes[k]) {
786
                    break;
787
                }
788
            }
789
            offsets[line_num] = offsets[line_num - 1] + 4 - k;
790
            for (; k < 4; ++k) {
791
                chars.push_back(bytes[k] ? bytes[k] : '\0');
792
            }
793
        } else {
794
            static_assert(std::endian::native == std::endian::big ||
795
                                  std::endian::native == std::endian::little,
796
                          "Unsupported endianness");
797
        }
798
487
    }
799
};
800
801
class FunctionNgramSearch : public IFunction {
802
public:
803
    static constexpr auto name = "ngram_search";
804
24
    static FunctionPtr create() { return std::make_shared<FunctionNgramSearch>(); }
805
1
    String get_name() const override { return name; }
806
15
    size_t get_number_of_arguments() const override { return 3; }
807
15
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
808
15
        return std::make_shared<DataTypeFloat64>();
809
15
    }
810
811
    // ngram_search(text,pattern,gram_num)
812
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
813
14
                        uint32_t result, size_t input_rows_count) const override {
814
14
        CHECK_EQ(arguments.size(), 3);
815
14
        auto col_res = ColumnFloat64::create();
816
14
        bool col_const[3];
817
14
        ColumnPtr argument_columns[3];
818
56
        for (int i = 0; i < 3; ++i) {
819
42
            std::tie(argument_columns[i], col_const[i]) =
820
42
                    unpack_if_const(block.get_by_position(arguments[i]).column);
821
42
        }
822
14
        auto pattern = assert_cast<const ColumnString*>(argument_columns[1].get())->get_data_at(0);
823
14
        auto gram_num = assert_cast<const ColumnInt32*>(argument_columns[2].get())->get_element(0);
824
14
        const auto* text_col = assert_cast<const ColumnString*>(argument_columns[0].get());
825
826
14
        if (col_const[0]) {
827
0
            _execute_impl<true>(text_col, pattern, gram_num, *col_res, input_rows_count);
828
14
        } else {
829
14
            _execute_impl<false>(text_col, pattern, gram_num, *col_res, input_rows_count);
830
14
        }
831
832
14
        block.replace_by_position(result, std::move(col_res));
833
14
        return Status::OK();
834
14
    }
835
836
private:
837
    using NgramMap = phmap::flat_hash_map<uint32_t, uint8_t>;
838
    constexpr static auto not_found = 0b00;
839
    constexpr static auto found_in_pattern = 0b01;
840
    constexpr static auto found_in_text = 0b10;
841
    constexpr static auto found_in_pattern_and_text = 0b11;
842
843
173
    uint32_t sub_str_hash(const char* data, int32_t length) const {
844
173
        constexpr static uint32_t seed = 0;
845
173
        return crc32c::Extend(seed, (const uint8_t*)data, length);
846
173
    }
847
848
    template <bool column_const>
849
    void _execute_impl(const ColumnString* text_col, StringRef& pattern, int gram_num,
850
14
                       ColumnFloat64& res, size_t size) const {
851
14
        auto& res_data = res.get_data();
852
14
        res_data.resize_fill(size, 0);
853
        // If the length of the pattern is less than gram_num, return 0.
854
14
        if (pattern.size < gram_num) {
855
0
            return;
856
0
        }
857
858
        // Build a map by pattern string, which will be used repeatedly in the following loop.
859
14
        NgramMap pattern_map;
860
14
        int pattern_count = get_pattern_set(pattern_map, pattern, gram_num);
861
        // Each time a loop is executed, the map will be modified, so it needs to be restored afterward.
862
14
        std::vector<uint32_t> restore_map;
863
864
35
        for (int i = 0; i < size; i++) {
865
21
            auto text = text_col->get_data_at(index_check_const<column_const>(i));
866
21
            if (text.size < gram_num) {
867
                // If the length of the text is less than gram_num, return 0.
868
4
                continue;
869
4
            }
870
17
            restore_map.reserve(text.size);
871
17
            auto [text_count, intersection_count] =
872
17
                    get_text_set(text, gram_num, pattern_map, restore_map);
873
874
            // 2 * |Intersection| / (|text substr set| + |pattern substr set|)
875
17
            res_data[i] = 2.0 * intersection_count / (text_count + pattern_count);
876
17
        }
877
14
    }
Unexecuted instantiation: _ZNK5doris19FunctionNgramSearch13_execute_implILb1EEEvPKNS_9ColumnStrIjEERNS_9StringRefEiRNS_12ColumnVectorILNS_13PrimitiveTypeE9EEEm
_ZNK5doris19FunctionNgramSearch13_execute_implILb0EEEvPKNS_9ColumnStrIjEERNS_9StringRefEiRNS_12ColumnVectorILNS_13PrimitiveTypeE9EEEm
Line
Count
Source
850
14
                       ColumnFloat64& res, size_t size) const {
851
14
        auto& res_data = res.get_data();
852
14
        res_data.resize_fill(size, 0);
853
        // If the length of the pattern is less than gram_num, return 0.
854
14
        if (pattern.size < gram_num) {
855
0
            return;
856
0
        }
857
858
        // Build a map by pattern string, which will be used repeatedly in the following loop.
859
14
        NgramMap pattern_map;
860
14
        int pattern_count = get_pattern_set(pattern_map, pattern, gram_num);
861
        // Each time a loop is executed, the map will be modified, so it needs to be restored afterward.
862
14
        std::vector<uint32_t> restore_map;
863
864
35
        for (int i = 0; i < size; i++) {
865
21
            auto text = text_col->get_data_at(index_check_const<column_const>(i));
866
21
            if (text.size < gram_num) {
867
                // If the length of the text is less than gram_num, return 0.
868
4
                continue;
869
4
            }
870
17
            restore_map.reserve(text.size);
871
17
            auto [text_count, intersection_count] =
872
17
                    get_text_set(text, gram_num, pattern_map, restore_map);
873
874
            // 2 * |Intersection| / (|text substr set| + |pattern substr set|)
875
17
            res_data[i] = 2.0 * intersection_count / (text_count + pattern_count);
876
17
        }
877
14
    }
878
879
14
    size_t get_pattern_set(NgramMap& pattern_map, StringRef& pattern, int gram_num) const {
880
14
        size_t pattern_count = 0;
881
87
        for (int i = 0; i + gram_num <= pattern.size; i++) {
882
73
            uint32_t cur_hash = sub_str_hash(pattern.data + i, gram_num);
883
73
            if (!pattern_map.contains(cur_hash)) {
884
43
                pattern_map[cur_hash] = found_in_pattern;
885
43
                pattern_count++;
886
43
            }
887
73
        }
888
14
        return pattern_count;
889
14
    }
890
891
    std::pair<size_t, size_t> get_text_set(StringRef& text, int gram_num, NgramMap& pattern_map,
892
17
                                           std::vector<uint32_t>& restore_map) const {
893
17
        restore_map.clear();
894
        //intersection_count indicates a substring both in pattern and text.
895
17
        size_t text_count = 0, intersection_count = 0;
896
117
        for (int i = 0; i + gram_num <= text.size; i++) {
897
100
            uint32_t cur_hash = sub_str_hash(text.data + i, gram_num);
898
100
            auto& val = pattern_map[cur_hash];
899
100
            if (val == not_found) {
900
26
                val ^= found_in_text;
901
26
                DCHECK(val == found_in_text);
902
                // only found in text
903
26
                text_count++;
904
26
                restore_map.push_back(cur_hash);
905
74
            } else if (val == found_in_pattern) {
906
39
                val ^= found_in_text;
907
39
                DCHECK(val == found_in_pattern_and_text);
908
                // found in text and pattern
909
39
                text_count++;
910
39
                intersection_count++;
911
39
                restore_map.push_back(cur_hash);
912
39
            }
913
100
        }
914
        // Restore the pattern_map.
915
65
        for (auto& restore_hash : restore_map) {
916
65
            pattern_map[restore_hash] ^= found_in_text;
917
65
        }
918
919
17
        return {text_count, intersection_count};
920
17
    }
921
};
922
923
class FunctionTranslate : public IFunction {
924
public:
925
    static constexpr auto name = "translate";
926
    using AsciiMap = std::array<UInt8, 128>;
927
    constexpr static UInt8 DELETE_CHAR = 255; // 255 means delete this char
928
100
    static FunctionPtr create() { return std::make_shared<FunctionTranslate>(); }
929
1
    String get_name() const override { return name; }
930
91
    size_t get_number_of_arguments() const override { return 3; }
931
932
91
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
933
91
        return std::make_shared<DataTypeString>();
934
91
    };
935
936
8
    DataTypes get_variadic_argument_types_impl() const override {
937
8
        return {std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>(),
938
8
                std::make_shared<DataTypeString>()};
939
8
    }
940
941
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
942
175
                        uint32_t result, size_t input_rows_count) const override {
943
175
        CHECK_EQ(arguments.size(), 3);
944
175
        auto col_res = ColumnString::create();
945
175
        bool col_const[3];
946
175
        ColumnPtr argument_columns[3];
947
700
        for (int i = 0; i < 3; ++i) {
948
525
            col_const[i] = is_column_const(*block.get_by_position(arguments[i]).column);
949
525
        }
950
175
        argument_columns[0] = col_const[0] ? static_cast<const ColumnConst&>(
951
20
                                                     *block.get_by_position(arguments[0]).column)
952
20
                                                     .convert_to_full_column()
953
175
                                           : block.get_by_position(arguments[0]).column;
954
175
        default_preprocess_parameter_columns(argument_columns, col_const, {1, 2}, block, arguments);
955
956
175
        const auto* col_source = assert_cast<const ColumnString*>(argument_columns[0].get());
957
175
        const auto* col_from = assert_cast<const ColumnString*>(argument_columns[1].get());
958
175
        const auto* col_to = assert_cast<const ColumnString*>(argument_columns[2].get());
959
960
175
        bool is_ascii = col_source->is_ascii() && col_from->is_ascii() && col_to->is_ascii();
961
175
        auto impl_vectors = impl_vectors_utf8<false>;
962
175
        if (col_const[1] && col_const[2] && is_ascii) {
963
34
            impl_vectors = impl_vectors_ascii<true>;
964
141
        } else if (col_const[1] && col_const[2]) {
965
1
            impl_vectors = impl_vectors_utf8<true>;
966
140
        } else if (is_ascii) {
967
88
            impl_vectors = impl_vectors_ascii<false>;
968
88
        }
969
175
        impl_vectors(col_source, col_from, col_to, col_res.get());
970
175
        block.get_by_position(result).column = std::move(col_res);
971
175
        return Status::OK();
972
175
    }
973
974
private:
975
    template <bool IsConst>
976
    static void impl_vectors_ascii(const ColumnString* col_source, const ColumnString* col_from,
977
122
                                   const ColumnString* col_to, ColumnString* col_res) {
978
122
        auto& res_chars = col_res->get_chars();
979
122
        auto& res_offsets = col_res->get_offsets();
980
122
        res_chars.reserve(col_source->get_chars().size());
981
122
        res_offsets.reserve(col_source->get_offsets().size());
982
122
        DCHECK_EQ(col_res->size(), 0);
983
122
        AsciiMap map;
984
122
        if (IsConst) {
985
34
            const auto& from_str = col_from->get_data_at(0);
986
34
            const auto& to_str = col_to->get_data_at(0);
987
34
            if (!build_translate_map_ascii(map, from_str, to_str)) {
988
                // if the map is not need delete char, we can directly copy the source string,then use map to translate
989
24
                res_offsets.insert(col_source->get_offsets().begin(),
990
24
                                   col_source->get_offsets().end());
991
24
                res_chars.insert(col_source->get_chars().begin(), col_source->get_chars().end());
992
214
                for (int i = 0; i < res_chars.size(); ++i) {
993
190
                    res_chars[i] = map[res_chars[i]]; // translate the chars
994
190
                }
995
24
                return; // no need to translate
996
24
            }
997
34
        }
998
999
98
        auto res_size = 0;
1000
98
        auto* begin_data = col_res->get_chars().data();
1001
216
        for (size_t i = 0; i < col_source->size(); ++i) {
1002
118
            const auto& source_str = col_source->get_data_at(i);
1003
118
            if (!IsConst) {
1004
104
                const auto& from_str = col_from->get_data_at(i);
1005
104
                const auto& to_str = col_to->get_data_at(i);
1006
104
                build_translate_map_ascii(map, from_str, to_str);
1007
104
            }
1008
118
            auto* dst_data = begin_data + res_size;
1009
118
            res_size += translate_ascii(source_str, map, dst_data);
1010
1011
118
            res_offsets.push_back(res_size);
1012
118
        }
1013
98
        DCHECK_GE(res_chars.capacity(), res_size);
1014
98
        res_chars.resize(res_size);
1015
98
    }
_ZN5doris17FunctionTranslate18impl_vectors_asciiILb1EEEvPKNS_9ColumnStrIjEES5_S5_PS3_
Line
Count
Source
977
34
                                   const ColumnString* col_to, ColumnString* col_res) {
978
34
        auto& res_chars = col_res->get_chars();
979
34
        auto& res_offsets = col_res->get_offsets();
980
34
        res_chars.reserve(col_source->get_chars().size());
981
34
        res_offsets.reserve(col_source->get_offsets().size());
982
34
        DCHECK_EQ(col_res->size(), 0);
983
34
        AsciiMap map;
984
34
        if (IsConst) {
985
34
            const auto& from_str = col_from->get_data_at(0);
986
34
            const auto& to_str = col_to->get_data_at(0);
987
34
            if (!build_translate_map_ascii(map, from_str, to_str)) {
988
                // if the map is not need delete char, we can directly copy the source string,then use map to translate
989
24
                res_offsets.insert(col_source->get_offsets().begin(),
990
24
                                   col_source->get_offsets().end());
991
24
                res_chars.insert(col_source->get_chars().begin(), col_source->get_chars().end());
992
214
                for (int i = 0; i < res_chars.size(); ++i) {
993
190
                    res_chars[i] = map[res_chars[i]]; // translate the chars
994
190
                }
995
24
                return; // no need to translate
996
24
            }
997
34
        }
998
999
10
        auto res_size = 0;
1000
10
        auto* begin_data = col_res->get_chars().data();
1001
24
        for (size_t i = 0; i < col_source->size(); ++i) {
1002
14
            const auto& source_str = col_source->get_data_at(i);
1003
14
            if (!IsConst) {
1004
0
                const auto& from_str = col_from->get_data_at(i);
1005
0
                const auto& to_str = col_to->get_data_at(i);
1006
0
                build_translate_map_ascii(map, from_str, to_str);
1007
0
            }
1008
14
            auto* dst_data = begin_data + res_size;
1009
14
            res_size += translate_ascii(source_str, map, dst_data);
1010
1011
14
            res_offsets.push_back(res_size);
1012
14
        }
1013
        DCHECK_GE(res_chars.capacity(), res_size);
1014
10
        res_chars.resize(res_size);
1015
10
    }
_ZN5doris17FunctionTranslate18impl_vectors_asciiILb0EEEvPKNS_9ColumnStrIjEES5_S5_PS3_
Line
Count
Source
977
88
                                   const ColumnString* col_to, ColumnString* col_res) {
978
88
        auto& res_chars = col_res->get_chars();
979
88
        auto& res_offsets = col_res->get_offsets();
980
88
        res_chars.reserve(col_source->get_chars().size());
981
88
        res_offsets.reserve(col_source->get_offsets().size());
982
88
        DCHECK_EQ(col_res->size(), 0);
983
88
        AsciiMap map;
984
88
        if (IsConst) {
985
0
            const auto& from_str = col_from->get_data_at(0);
986
0
            const auto& to_str = col_to->get_data_at(0);
987
0
            if (!build_translate_map_ascii(map, from_str, to_str)) {
988
                // if the map is not need delete char, we can directly copy the source string,then use map to translate
989
0
                res_offsets.insert(col_source->get_offsets().begin(),
990
0
                                   col_source->get_offsets().end());
991
0
                res_chars.insert(col_source->get_chars().begin(), col_source->get_chars().end());
992
0
                for (int i = 0; i < res_chars.size(); ++i) {
993
0
                    res_chars[i] = map[res_chars[i]]; // translate the chars
994
0
                }
995
0
                return; // no need to translate
996
0
            }
997
0
        }
998
999
88
        auto res_size = 0;
1000
88
        auto* begin_data = col_res->get_chars().data();
1001
192
        for (size_t i = 0; i < col_source->size(); ++i) {
1002
104
            const auto& source_str = col_source->get_data_at(i);
1003
104
            if (!IsConst) {
1004
104
                const auto& from_str = col_from->get_data_at(i);
1005
104
                const auto& to_str = col_to->get_data_at(i);
1006
104
                build_translate_map_ascii(map, from_str, to_str);
1007
104
            }
1008
104
            auto* dst_data = begin_data + res_size;
1009
104
            res_size += translate_ascii(source_str, map, dst_data);
1010
1011
104
            res_offsets.push_back(res_size);
1012
104
        }
1013
        DCHECK_GE(res_chars.capacity(), res_size);
1014
88
        res_chars.resize(res_size);
1015
88
    }
1016
1017
    // return true if no need delete char
1018
    bool static build_translate_map_ascii(AsciiMap& map, const StringRef& from_str,
1019
138
                                          const StringRef& to_str) {
1020
17.8k
        for (size_t i = 0; i < map.size(); ++i) {
1021
17.6k
            map[i] = i; // initialize map to identity
1022
17.6k
        }
1023
138
        std::array<UInt8, 128> set_map {0};
1024
138
        const auto min_size = std::min(from_str.size, to_str.size);
1025
        // all ascii characters are in the range [0, 127]
1026
476
        for (size_t i = 0; i < min_size; ++i) {
1027
338
            auto from_char = from_str.data[i];
1028
338
            auto to_char = to_str.data[i];
1029
338
            if (set_map[from_char] == 0) {
1030
243
                set_map[from_char] = 1;
1031
243
                map[from_char] = to_char;
1032
243
            }
1033
338
        }
1034
1035
138
        bool need_delete_char = false;
1036
1037
207
        for (size_t i = min_size; i < from_str.size; ++i) {
1038
69
            auto from_char = from_str.data[i];
1039
69
            if (set_map[from_char] == 0) {
1040
57
                set_map[from_char] = 1;
1041
57
                map[from_char] = DELETE_CHAR; // delete this char
1042
57
                need_delete_char = true;
1043
57
            }
1044
69
        }
1045
138
        return need_delete_char;
1046
138
    }
1047
1048
118
    static size_t translate_ascii(const StringRef& source_str, AsciiMap& map, UInt8* dst_data) {
1049
118
        auto* begin_data = dst_data;
1050
640
        for (size_t i = 0; i < source_str.size; ++i) {
1051
522
            auto c = source_str.data[i];
1052
522
            if (map[c] == DELETE_CHAR) {
1053
35
                continue; // delete this char
1054
35
            }
1055
487
            *dst_data++ = map[c];
1056
487
        }
1057
118
        return dst_data - begin_data;
1058
118
    }
1059
1060
    template <bool IsConst>
1061
    static void impl_vectors_utf8(const ColumnString* col_source, const ColumnString* col_from,
1062
53
                                  const ColumnString* col_to, ColumnString* col_res) {
1063
53
        col_res->get_chars().reserve(col_source->get_chars().size());
1064
53
        col_res->get_offsets().reserve(col_source->get_offsets().size());
1065
53
        std::unordered_map<std::string_view, std::string_view> translate_map;
1066
53
        if (IsConst) {
1067
1
            const auto& from_str = col_from->get_data_at(0);
1068
1
            const auto& to_str = col_to->get_data_at(0);
1069
1
            translate_map =
1070
1
                    build_translate_map_utf8(from_str.to_string_view(), to_str.to_string_view());
1071
1
        }
1072
344
        for (size_t i = 0; i < col_source->size(); ++i) {
1073
291
            const auto& source_str = col_source->get_data_at(i);
1074
291
            if (!IsConst) {
1075
290
                const auto& from_str = col_from->get_data_at(i);
1076
290
                const auto& to_str = col_to->get_data_at(i);
1077
290
                translate_map = build_translate_map_utf8(from_str.to_string_view(),
1078
290
                                                         to_str.to_string_view());
1079
290
            }
1080
291
            auto translated_str = translate_utf8(source_str.to_string_view(), translate_map);
1081
291
            col_res->insert_data(translated_str.data(), translated_str.size());
1082
291
        }
1083
53
    }
_ZN5doris17FunctionTranslate17impl_vectors_utf8ILb0EEEvPKNS_9ColumnStrIjEES5_S5_PS3_
Line
Count
Source
1062
52
                                  const ColumnString* col_to, ColumnString* col_res) {
1063
52
        col_res->get_chars().reserve(col_source->get_chars().size());
1064
52
        col_res->get_offsets().reserve(col_source->get_offsets().size());
1065
52
        std::unordered_map<std::string_view, std::string_view> translate_map;
1066
52
        if (IsConst) {
1067
0
            const auto& from_str = col_from->get_data_at(0);
1068
0
            const auto& to_str = col_to->get_data_at(0);
1069
0
            translate_map =
1070
0
                    build_translate_map_utf8(from_str.to_string_view(), to_str.to_string_view());
1071
0
        }
1072
342
        for (size_t i = 0; i < col_source->size(); ++i) {
1073
290
            const auto& source_str = col_source->get_data_at(i);
1074
290
            if (!IsConst) {
1075
290
                const auto& from_str = col_from->get_data_at(i);
1076
290
                const auto& to_str = col_to->get_data_at(i);
1077
290
                translate_map = build_translate_map_utf8(from_str.to_string_view(),
1078
290
                                                         to_str.to_string_view());
1079
290
            }
1080
290
            auto translated_str = translate_utf8(source_str.to_string_view(), translate_map);
1081
290
            col_res->insert_data(translated_str.data(), translated_str.size());
1082
290
        }
1083
52
    }
_ZN5doris17FunctionTranslate17impl_vectors_utf8ILb1EEEvPKNS_9ColumnStrIjEES5_S5_PS3_
Line
Count
Source
1062
1
                                  const ColumnString* col_to, ColumnString* col_res) {
1063
1
        col_res->get_chars().reserve(col_source->get_chars().size());
1064
1
        col_res->get_offsets().reserve(col_source->get_offsets().size());
1065
1
        std::unordered_map<std::string_view, std::string_view> translate_map;
1066
1
        if (IsConst) {
1067
1
            const auto& from_str = col_from->get_data_at(0);
1068
1
            const auto& to_str = col_to->get_data_at(0);
1069
1
            translate_map =
1070
1
                    build_translate_map_utf8(from_str.to_string_view(), to_str.to_string_view());
1071
1
        }
1072
2
        for (size_t i = 0; i < col_source->size(); ++i) {
1073
1
            const auto& source_str = col_source->get_data_at(i);
1074
1
            if (!IsConst) {
1075
0
                const auto& from_str = col_from->get_data_at(i);
1076
0
                const auto& to_str = col_to->get_data_at(i);
1077
0
                translate_map = build_translate_map_utf8(from_str.to_string_view(),
1078
0
                                                         to_str.to_string_view());
1079
0
            }
1080
1
            auto translated_str = translate_utf8(source_str.to_string_view(), translate_map);
1081
1
            col_res->insert_data(translated_str.data(), translated_str.size());
1082
1
        }
1083
1
    }
1084
1085
    static std::unordered_map<std::string_view, std::string_view> build_translate_map_utf8(
1086
291
            const std::string_view& from_str, const std::string_view& to_str) {
1087
291
        std::unordered_map<std::string_view, std::string_view> translate_map;
1088
1.77k
        for (size_t i = 0, from_char_size = 0, j = 0, to_char_size = 0; i < from_str.size();
1089
1.48k
             i += from_char_size, j += to_char_size) {
1090
1.48k
            from_char_size = get_utf8_byte_length(from_str[i]);
1091
1.48k
            to_char_size = j < to_str.size() ? get_utf8_byte_length(to_str[j]) : 0;
1092
1.48k
            auto from_char = from_str.substr(i, from_char_size);
1093
1.48k
            if (translate_map.find(from_char) == translate_map.end()) {
1094
799
                translate_map[from_char] =
1095
799
                        j < to_str.size() ? to_str.substr(j, to_char_size) : std::string_view();
1096
799
            }
1097
1.48k
        }
1098
291
        return translate_map;
1099
291
    }
1100
1101
    static std::string translate_utf8(
1102
            const std::string_view& source_str,
1103
291
            std::unordered_map<std::string_view, std::string_view>& translate_map) {
1104
291
        std::string result;
1105
291
        result.reserve(source_str.size());
1106
1.71k
        for (size_t i = 0, char_size = 0; i < source_str.size(); i += char_size) {
1107
1.42k
            char_size = get_utf8_byte_length(source_str[i]);
1108
1.42k
            auto c = source_str.substr(i, char_size);
1109
1.42k
            if (translate_map.find(c) != translate_map.end()) {
1110
255
                if (!translate_map[c].empty()) {
1111
159
                    result.append(translate_map[c]);
1112
159
                }
1113
1.17k
            } else {
1114
1.17k
                result.append(c);
1115
1.17k
            }
1116
1.42k
        }
1117
291
        return result;
1118
291
    }
1119
};
1120
1121
/// xpath_string(xml, xpath) -> String
1122
/// Returns the text content of the first node that matches the XPath expression.
1123
/// Returns NULL if either xml or xpath is NULL.
1124
/// Returns empty string if the XPath expression matches no nodes.
1125
/// The text content includes the node and all its descendants.
1126
/// Example:
1127
///   xpath_string('<a><b>b1</b><b>b2</b></a>', '/a/b[1]') = 'b1'
1128
///   xpath_string('<a><b>b1</b><b>b2</b></a>', '/a/b[2]') = 'b2'
1129
///   xpath_string('<a><b>b1</b><b>b2</b></a>', '/a/c') = ''
1130
///   xpath_string('invalid xml', '/a/b[1]') = NULL
1131
///   xpath_string(NULL, '/a/b[1]') = NULL
1132
///   xpath_string('<a><b>b1</b><b>b2</b></a>', NULL) = NULL
1133
class FunctionXPathString : public IFunction {
1134
public:
1135
    static constexpr auto name = "xpath_string";
1136
173
    static FunctionPtr create() { return std::make_shared<FunctionXPathString>(); }
1137
1
    String get_name() const override { return name; }
1138
164
    size_t get_number_of_arguments() const override { return 2; }
1139
164
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
1140
164
        return make_nullable(std::make_shared<DataTypeString>());
1141
164
    }
1142
1143
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
1144
246
                        uint32_t result, size_t input_rows_count) const override {
1145
246
        CHECK_EQ(arguments.size(), 2);
1146
246
        auto col_res = ColumnNullable::create(ColumnString::create(), ColumnUInt8::create());
1147
246
        const auto& [left_col, left_const] =
1148
246
                unpack_if_const(block.get_by_position(arguments[0]).column);
1149
246
        const auto& [right_col, right_const] =
1150
246
                unpack_if_const(block.get_by_position(arguments[1]).column);
1151
246
        const auto& xml_col = *assert_cast<const ColumnString*>(left_col.get());
1152
246
        const auto& xpath_col = *assert_cast<const ColumnString*>(right_col.get());
1153
1154
246
        Status status;
1155
246
        if (left_const && right_const) {
1156
0
            status = execute_vector<true, true>(input_rows_count, xml_col, xpath_col, *col_res);
1157
246
        } else if (left_const) {
1158
42
            status = execute_vector<true, false>(input_rows_count, xml_col, xpath_col, *col_res);
1159
204
        } else if (right_const) {
1160
51
            status = execute_vector<false, true>(input_rows_count, xml_col, xpath_col, *col_res);
1161
153
        } else {
1162
153
            status = execute_vector<false, false>(input_rows_count, xml_col, xpath_col, *col_res);
1163
153
        }
1164
246
        if (!status.ok()) {
1165
1
            return status;
1166
1
        }
1167
1168
245
        block.get_by_position(result).column = std::move(col_res);
1169
245
        return Status::OK();
1170
246
    }
1171
1172
private:
1173
331
    static Status parse_xml(const StringRef& xml_str, pugi::xml_document& xml_doc) {
1174
331
        pugi::xml_parse_result result = xml_doc.load_buffer(xml_str.data, xml_str.size);
1175
331
        if (!result) {
1176
1
            return Status::InvalidArgument("Function {} failed to parse XML string: {}", name,
1177
1
                                           result.description());
1178
1
        }
1179
330
        return Status::OK();
1180
331
    }
1181
1182
340
    static Status build_xpath_query(const StringRef& xpath_str, pugi::xpath_query& xpath_query) {
1183
        // xpath_query will throws xpath_exception on compilation errors.
1184
340
        try {
1185
            // NOTE!!!: don't use to_string_view(), because xpath_str maybe not null-terminated
1186
340
            xpath_query = pugi::xpath_query(xpath_str.to_string().c_str());
1187
340
        } catch (const pugi::xpath_exception& e) {
1188
0
            return Status::InvalidArgument("Function {} failed to build XPath query: {}", name,
1189
0
                                           e.what());
1190
0
        }
1191
340
        return Status::OK();
1192
340
    }
1193
1194
    template <bool left_const, bool right_const>
1195
    static Status execute_vector(const size_t input_rows_count, const ColumnString& xml_col,
1196
246
                                 const ColumnString& xpath_col, ColumnNullable& res_col) {
1197
246
        pugi::xml_document xml_doc;
1198
246
        pugi::xpath_query xpath_query;
1199
        // first check right_const, because we want to check empty input first
1200
246
        if constexpr (right_const) {
1201
51
            auto xpath_str = xpath_col.get_data_at(0);
1202
51
            if (xpath_str.empty()) {
1203
                // should return null if xpath_str is empty
1204
1
                res_col.insert_many_defaults(input_rows_count);
1205
1
                return Status::OK();
1206
1
            }
1207
50
            RETURN_IF_ERROR(build_xpath_query(xpath_str, xpath_query));
1208
50
        }
1209
50
        if constexpr (left_const) {
1210
42
            auto xml_str = xml_col.get_data_at(0);
1211
42
            if (xml_str.empty()) {
1212
                // should return null if xml_str is empty
1213
1
                res_col.insert_many_defaults(input_rows_count);
1214
1
                return Status::OK();
1215
1
            }
1216
41
            RETURN_IF_ERROR(parse_xml(xml_str, xml_doc));
1217
41
        }
1218
1219
633
        for (size_t i = 0; i < input_rows_count; ++i) {
1220
388
            if constexpr (!right_const) {
1221
308
                auto xpath_str = xpath_col.get_data_at(i);
1222
308
                if (xpath_str.empty()) {
1223
                    // should return null if xpath_str is empty
1224
18
                    res_col.insert_default();
1225
18
                    continue;
1226
18
                }
1227
290
                RETURN_IF_ERROR(build_xpath_query(xpath_str, xpath_query));
1228
290
            }
1229
327
            if constexpr (!left_const) {
1230
327
                auto xml_str = xml_col.get_data_at(i);
1231
327
                if (xml_str.empty()) {
1232
                    // should return null if xml_str is empty
1233
20
                    res_col.insert_default();
1234
20
                    continue;
1235
20
                }
1236
307
                RETURN_IF_ERROR(parse_xml(xml_str, xml_doc));
1237
307
            }
1238
306
            std::string text;
1239
388
            try {
1240
388
                text = xpath_query.evaluate_string(xml_doc);
1241
388
            } catch (const pugi::xpath_exception& e) {
1242
0
                return Status::InvalidArgument("Function {} failed to query XPath string: {}", name,
1243
0
                                               e.what());
1244
0
            }
1245
349
            res_col.insert_data(text.data(), text.size());
1246
349
        }
1247
245
        return Status::OK();
1248
246
    }
Unexecuted instantiation: _ZN5doris19FunctionXPathString14execute_vectorILb1ELb1EEENS_6StatusEmRKNS_9ColumnStrIjEES6_RNS_14ColumnNullableE
_ZN5doris19FunctionXPathString14execute_vectorILb1ELb0EEENS_6StatusEmRKNS_9ColumnStrIjEES6_RNS_14ColumnNullableE
Line
Count
Source
1196
42
                                 const ColumnString& xpath_col, ColumnNullable& res_col) {
1197
42
        pugi::xml_document xml_doc;
1198
42
        pugi::xpath_query xpath_query;
1199
        // first check right_const, because we want to check empty input first
1200
        if constexpr (right_const) {
1201
            auto xpath_str = xpath_col.get_data_at(0);
1202
            if (xpath_str.empty()) {
1203
                // should return null if xpath_str is empty
1204
                res_col.insert_many_defaults(input_rows_count);
1205
                return Status::OK();
1206
            }
1207
            RETURN_IF_ERROR(build_xpath_query(xpath_str, xpath_query));
1208
        }
1209
42
        if constexpr (left_const) {
1210
42
            auto xml_str = xml_col.get_data_at(0);
1211
42
            if (xml_str.empty()) {
1212
                // should return null if xml_str is empty
1213
1
                res_col.insert_many_defaults(input_rows_count);
1214
1
                return Status::OK();
1215
1
            }
1216
41
            RETURN_IF_ERROR(parse_xml(xml_str, xml_doc));
1217
41
        }
1218
1219
103
        for (size_t i = 0; i < input_rows_count; ++i) {
1220
61
            if constexpr (!right_const) {
1221
61
                auto xpath_str = xpath_col.get_data_at(i);
1222
61
                if (xpath_str.empty()) {
1223
                    // should return null if xpath_str is empty
1224
1
                    res_col.insert_default();
1225
1
                    continue;
1226
1
                }
1227
60
                RETURN_IF_ERROR(build_xpath_query(xpath_str, xpath_query));
1228
60
            }
1229
            if constexpr (!left_const) {
1230
                auto xml_str = xml_col.get_data_at(i);
1231
                if (xml_str.empty()) {
1232
                    // should return null if xml_str is empty
1233
                    res_col.insert_default();
1234
                    continue;
1235
                }
1236
                RETURN_IF_ERROR(parse_xml(xml_str, xml_doc));
1237
            }
1238
61
            std::string text;
1239
61
            try {
1240
61
                text = xpath_query.evaluate_string(xml_doc);
1241
61
            } catch (const pugi::xpath_exception& e) {
1242
0
                return Status::InvalidArgument("Function {} failed to query XPath string: {}", name,
1243
0
                                               e.what());
1244
0
            }
1245
60
            res_col.insert_data(text.data(), text.size());
1246
60
        }
1247
42
        return Status::OK();
1248
42
    }
_ZN5doris19FunctionXPathString14execute_vectorILb0ELb1EEENS_6StatusEmRKNS_9ColumnStrIjEES6_RNS_14ColumnNullableE
Line
Count
Source
1196
51
                                 const ColumnString& xpath_col, ColumnNullable& res_col) {
1197
51
        pugi::xml_document xml_doc;
1198
51
        pugi::xpath_query xpath_query;
1199
        // first check right_const, because we want to check empty input first
1200
51
        if constexpr (right_const) {
1201
51
            auto xpath_str = xpath_col.get_data_at(0);
1202
51
            if (xpath_str.empty()) {
1203
                // should return null if xpath_str is empty
1204
1
                res_col.insert_many_defaults(input_rows_count);
1205
1
                return Status::OK();
1206
1
            }
1207
50
            RETURN_IF_ERROR(build_xpath_query(xpath_str, xpath_query));
1208
50
        }
1209
        if constexpr (left_const) {
1210
            auto xml_str = xml_col.get_data_at(0);
1211
            if (xml_str.empty()) {
1212
                // should return null if xml_str is empty
1213
                res_col.insert_many_defaults(input_rows_count);
1214
                return Status::OK();
1215
            }
1216
            RETURN_IF_ERROR(parse_xml(xml_str, xml_doc));
1217
        }
1218
1219
131
        for (size_t i = 0; i < input_rows_count; ++i) {
1220
            if constexpr (!right_const) {
1221
                auto xpath_str = xpath_col.get_data_at(i);
1222
                if (xpath_str.empty()) {
1223
                    // should return null if xpath_str is empty
1224
                    res_col.insert_default();
1225
                    continue;
1226
                }
1227
                RETURN_IF_ERROR(build_xpath_query(xpath_str, xpath_query));
1228
            }
1229
80
            if constexpr (!left_const) {
1230
80
                auto xml_str = xml_col.get_data_at(i);
1231
80
                if (xml_str.empty()) {
1232
                    // should return null if xml_str is empty
1233
5
                    res_col.insert_default();
1234
5
                    continue;
1235
5
                }
1236
75
                RETURN_IF_ERROR(parse_xml(xml_str, xml_doc));
1237
75
            }
1238
75
            std::string text;
1239
80
            try {
1240
80
                text = xpath_query.evaluate_string(xml_doc);
1241
80
            } catch (const pugi::xpath_exception& e) {
1242
0
                return Status::InvalidArgument("Function {} failed to query XPath string: {}", name,
1243
0
                                               e.what());
1244
0
            }
1245
75
            res_col.insert_data(text.data(), text.size());
1246
75
        }
1247
51
        return Status::OK();
1248
51
    }
_ZN5doris19FunctionXPathString14execute_vectorILb0ELb0EEENS_6StatusEmRKNS_9ColumnStrIjEES6_RNS_14ColumnNullableE
Line
Count
Source
1196
153
                                 const ColumnString& xpath_col, ColumnNullable& res_col) {
1197
153
        pugi::xml_document xml_doc;
1198
153
        pugi::xpath_query xpath_query;
1199
        // first check right_const, because we want to check empty input first
1200
        if constexpr (right_const) {
1201
            auto xpath_str = xpath_col.get_data_at(0);
1202
            if (xpath_str.empty()) {
1203
                // should return null if xpath_str is empty
1204
                res_col.insert_many_defaults(input_rows_count);
1205
                return Status::OK();
1206
            }
1207
            RETURN_IF_ERROR(build_xpath_query(xpath_str, xpath_query));
1208
        }
1209
        if constexpr (left_const) {
1210
            auto xml_str = xml_col.get_data_at(0);
1211
            if (xml_str.empty()) {
1212
                // should return null if xml_str is empty
1213
                res_col.insert_many_defaults(input_rows_count);
1214
                return Status::OK();
1215
            }
1216
            RETURN_IF_ERROR(parse_xml(xml_str, xml_doc));
1217
        }
1218
1219
399
        for (size_t i = 0; i < input_rows_count; ++i) {
1220
247
            if constexpr (!right_const) {
1221
247
                auto xpath_str = xpath_col.get_data_at(i);
1222
247
                if (xpath_str.empty()) {
1223
                    // should return null if xpath_str is empty
1224
17
                    res_col.insert_default();
1225
17
                    continue;
1226
17
                }
1227
230
                RETURN_IF_ERROR(build_xpath_query(xpath_str, xpath_query));
1228
230
            }
1229
247
            if constexpr (!left_const) {
1230
247
                auto xml_str = xml_col.get_data_at(i);
1231
247
                if (xml_str.empty()) {
1232
                    // should return null if xml_str is empty
1233
15
                    res_col.insert_default();
1234
15
                    continue;
1235
15
                }
1236
232
                RETURN_IF_ERROR(parse_xml(xml_str, xml_doc));
1237
232
            }
1238
231
            std::string text;
1239
247
            try {
1240
247
                text = xpath_query.evaluate_string(xml_doc);
1241
247
            } catch (const pugi::xpath_exception& e) {
1242
0
                return Status::InvalidArgument("Function {} failed to query XPath string: {}", name,
1243
0
                                               e.what());
1244
0
            }
1245
214
            res_col.insert_data(text.data(), text.size());
1246
214
        }
1247
152
        return Status::OK();
1248
153
    }
1249
};
1250
1251
class MakeSetImpl {
1252
public:
1253
    static constexpr auto name = "make_set";
1254
1255
0
    static size_t get_number_of_arguments() { return 0; }
1256
36
    static bool is_variadic() { return true; }
1257
35
    static DataTypePtr get_return_type_impl(const DataTypes& arguments) {
1258
35
        if (arguments[0].get()->is_nullable()) {
1259
12
            return make_nullable(std::make_shared<DataTypeString>());
1260
12
        }
1261
23
        return std::make_shared<DataTypeString>();
1262
35
    }
1263
1264
    static bool is_return_nullable(bool has_nullable,
1265
35
                                   const std::vector<ColumnWithConstAndNullMap>& cols_info) {
1266
35
        return cols_info[0].null_map != nullptr;
1267
35
    }
1268
1269
    static bool execute_const_null(ColumnString::MutablePtr& res_col,
1270
                                   PaddedPODArray<UInt8>& res_null_map_data,
1271
2
                                   size_t input_rows_count, size_t null_index) {
1272
2
        if (null_index == 1) {
1273
0
            res_col->insert_many_defaults(input_rows_count);
1274
0
            res_null_map_data.assign(input_rows_count, (UInt8)1);
1275
0
            return true;
1276
0
        }
1277
2
        return false;
1278
2
    }
1279
1280
    static void execute(const std::vector<ColumnWithConstAndNullMap>& column_infos,
1281
                        ColumnString::MutablePtr& res_col, PaddedPODArray<UInt8>& res_null_map_data,
1282
35
                        size_t input_rows_count) {
1283
35
        static constexpr char SEPARATOR = ',';
1284
35
        const auto& bit_data =
1285
35
                assert_cast<const ColumnInt64&>(*column_infos[0].nested_col).get_data();
1286
35
        std::vector<const ColumnString*> str_cols(column_infos.size());
1287
249
        for (size_t i = 1; i < column_infos.size(); ++i) {
1288
214
            str_cols[i] = assert_cast<const ColumnString*>(column_infos[i].nested_col);
1289
214
        }
1290
1291
200
        for (size_t row = 0; row < input_rows_count; ++row) {
1292
165
            if (column_infos[0].is_null_at(row)) {
1293
10
                res_col->insert_default();
1294
10
                res_null_map_data[row] = 1;
1295
10
                continue;
1296
10
            }
1297
1298
155
            uint64_t bit = bit_data[column_infos[0].is_const ? 0 : row];
1299
155
            uint64_t col_pos = __builtin_ffsll(bit);
1300
155
            ColumnString::Chars data;
1301
452
            while (col_pos != 0 && col_pos < column_infos.size() && bit != 0) {
1302
297
                if (!column_infos[col_pos].is_null_at(row)) {
1303
                    /* Here insert `str,` directly to support the case below:
1304
                     * SELECT MAKE_SET(3, '', 'a');
1305
                     * the exception result should be ',a'.
1306
                     */
1307
259
                    auto s_ref = str_cols[col_pos]->get_data_at(
1308
259
                            column_infos[col_pos].is_const ? 0 : row);
1309
259
                    data.insert(s_ref.data, s_ref.data + s_ref.size);
1310
259
                    data.push_back(SEPARATOR);
1311
259
                }
1312
297
                bit &= ~(1ULL << (col_pos - 1));
1313
297
                col_pos = __builtin_ffsll(bit);
1314
297
            }
1315
            // remove the last ','
1316
155
            if (!data.empty()) {
1317
140
                data.pop_back();
1318
140
            }
1319
155
            res_col->insert_data(reinterpret_cast<const char*>(data.data()), data.size());
1320
155
        }
1321
35
    }
1322
};
1323
1324
class FunctionExportSet : public IFunction {
1325
public:
1326
    static constexpr auto name = "export_set";
1327
78
    static FunctionPtr create() { return std::make_shared<FunctionExportSet>(); }
1328
0
    String get_name() const override { return name; }
1329
0
    size_t get_number_of_arguments() const override { return 0; }
1330
70
    bool is_variadic() const override { return true; }
1331
69
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
1332
69
        return std::make_shared<DataTypeString>();
1333
69
    }
1334
1335
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
1336
69
                        uint32_t result, size_t input_rows_count) const override {
1337
69
        auto res_col = ColumnString::create();
1338
1339
69
        const size_t arg_size = arguments.size();
1340
69
        bool col_const[5];
1341
69
        ColumnPtr arg_cols[5];
1342
69
        bool all_const = true;
1343
244
        for (int i = 1; i < arg_size; ++i) {
1344
175
            col_const[i] = is_column_const(*block.get_by_position(arguments[i]).column);
1345
175
            all_const = all_const && col_const[i];
1346
175
        }
1347
69
        std::tie(arg_cols[0], col_const[0]) =
1348
69
                unpack_if_const(block.get_by_position(arguments[0]).column);
1349
69
        if (arg_size == 3) {
1350
49
            default_preprocess_parameter_columns(arg_cols, col_const, {1, 2}, block, arguments);
1351
49
        } else if (arg_size == 4) {
1352
3
            default_preprocess_parameter_columns(arg_cols, col_const, {1, 2, 3}, block, arguments);
1353
17
        } else if (arg_size == 5) {
1354
17
            default_preprocess_parameter_columns(arg_cols, col_const, {1, 2, 3, 4}, block,
1355
17
                                                 arguments);
1356
17
        }
1357
1358
69
        const auto* bit_col = assert_cast<const ColumnInt128*>(arg_cols[0].get());
1359
69
        const auto* on_col = assert_cast<const ColumnString*>(arg_cols[1].get());
1360
69
        const auto* off_col = assert_cast<const ColumnString*>(arg_cols[2].get());
1361
69
        const ColumnString* sep_col = nullptr;
1362
69
        const ColumnInt32* num_bits_col = nullptr;
1363
69
        if (arg_size > 3) {
1364
20
            sep_col = assert_cast<const ColumnString*>(arg_cols[3].get());
1365
20
            if (arg_size == 5) {
1366
17
                num_bits_col = assert_cast<const ColumnInt32*>(arg_cols[4].get());
1367
17
            }
1368
20
        }
1369
1370
246
        for (size_t i = 0; i < input_rows_count; ++i) {
1371
177
            uint64_t bit =
1372
177
                    check_and_get_bit(bit_col->get_element(index_check_const(i, col_const[0])));
1373
1374
177
            size_t idx_for_args = all_const ? 0 : i;
1375
177
            StringRef on = on_col->get_data_at(idx_for_args);
1376
177
            StringRef off = off_col->get_data_at(idx_for_args);
1377
177
            StringRef separator(",", 1);
1378
177
            int8_t num_of_bits = 64;
1379
1380
177
            if (arg_size > 3) {
1381
104
                separator = sep_col->get_data_at(idx_for_args);
1382
104
                if (arg_size == 5) {
1383
77
                    num_of_bits =
1384
77
                            check_and_get_num_of_bits(num_bits_col->get_element(idx_for_args));
1385
77
                }
1386
104
            }
1387
1388
177
            execute_single(bit, on, off, separator, num_of_bits, *res_col);
1389
177
        }
1390
69
        block.replace_by_position(result, std::move(res_col));
1391
69
        return Status::OK();
1392
69
    }
1393
1394
private:
1395
    /* The valid range of the input `bit` parameter should be [-2^63, 2^64 - 1]
1396
     * If it exceeds this range, the MAX/MIN values of the signed 64-bit integer are used for calculation
1397
     * This behavior is consistent with MySQL.
1398
     */
1399
177
    uint64_t check_and_get_bit(__int128 col_bit_val) const {
1400
177
        if (col_bit_val > ULLONG_MAX) {
1401
3
            return LLONG_MAX;
1402
174
        } else if (col_bit_val < LLONG_MIN) {
1403
1
            return LLONG_MIN;
1404
1
        }
1405
173
        return static_cast<uint64_t>(col_bit_val);
1406
177
    }
1407
1408
    // If the input value is not in the range [0, 64], return default value 64
1409
77
    int8_t check_and_get_num_of_bits(int32_t col_num_of_bits_val) const {
1410
77
        if (col_num_of_bits_val >= 0 && col_num_of_bits_val <= 64) {
1411
71
            return static_cast<int8_t>(col_num_of_bits_val);
1412
71
        }
1413
6
        return 64;
1414
77
    }
1415
1416
    void execute_single(uint64_t bit, const StringRef& on, const StringRef& off,
1417
                        const StringRef& separator, int8_t num_of_bits,
1418
177
                        ColumnString& res_col) const {
1419
177
        ColumnString::Chars data;
1420
177
        data.reserve(std::max(on.size, off.size) * num_of_bits +
1421
177
                     separator.size * (num_of_bits - 1));
1422
1423
5.03k
        while (bit && num_of_bits) {
1424
4.86k
            if (bit & 1) {
1425
3.04k
                data.insert(on.data, on.data + on.size);
1426
3.04k
            } else {
1427
1.82k
                data.insert(off.data, off.data + off.size);
1428
1.82k
            }
1429
4.86k
            bit >>= 1;
1430
4.86k
            if (--num_of_bits) {
1431
4.79k
                data.insert(separator.data, separator.data + separator.size);
1432
4.79k
            }
1433
4.86k
        }
1434
1435
177
        if (num_of_bits > 0) {
1436
111
            ColumnString::Chars off_sep_combo;
1437
111
            off_sep_combo.reserve(separator.size + off.size);
1438
111
            off_sep_combo.insert(off_sep_combo.end(), off.data, off.data + off.size);
1439
111
            off_sep_combo.insert(off_sep_combo.end(), separator.data,
1440
111
                                 separator.data + separator.size);
1441
1442
3.30k
            for (size_t i = 0; i < num_of_bits; ++i) {
1443
3.19k
                data.insert(off_sep_combo.data(), off_sep_combo.data() + off_sep_combo.size());
1444
3.19k
            }
1445
111
            data.erase(data.end() - separator.size, data.end());
1446
111
        }
1447
1448
177
        res_col.insert_data(reinterpret_cast<const char*>(data.data()), data.size());
1449
177
    }
1450
};
1451
1452
// ATTN: for debug only
1453
// compute crc32 hash value as the same way in `VOlapTablePartitionParam::find_tablets()`
1454
class FunctionCrc32Internal : public IFunction {
1455
public:
1456
    static constexpr auto name = "crc32_internal";
1457
44.1k
    static FunctionPtr create() { return std::make_shared<FunctionCrc32Internal>(); }
1458
0
    String get_name() const override { return name; }
1459
0
    size_t get_number_of_arguments() const override { return 0; }
1460
44.0k
    bool is_variadic() const override { return true; }
1461
59.2k
    bool use_default_implementation_for_nulls() const override { return false; }
1462
44.0k
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
1463
44.0k
        return std::make_shared<DataTypeInt64>();
1464
44.0k
    }
1465
1466
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
1467
15.0k
                        uint32_t result, size_t input_rows_count) const override {
1468
15.0k
        DCHECK_GE(arguments.size(), 1);
1469
1470
15.0k
        auto argument_size = arguments.size();
1471
15.0k
        std::vector<ColumnPtr> argument_columns(argument_size);
1472
15.0k
        std::vector<PrimitiveType> argument_primitive_types(argument_size);
1473
1474
30.4k
        for (size_t i = 0; i < argument_size; ++i) {
1475
15.3k
            argument_columns[i] =
1476
15.3k
                    block.get_by_position(arguments[i]).column->convert_to_full_column_if_const();
1477
15.3k
            argument_primitive_types[i] =
1478
15.3k
                    block.get_by_position(arguments[i]).type->get_primitive_type();
1479
15.3k
        }
1480
1481
15.0k
        auto res_col = ColumnInt64::create();
1482
15.0k
        auto& res_data = res_col->get_data();
1483
15.0k
        res_data.resize_fill(input_rows_count, 0);
1484
1485
14.9M
        for (size_t i = 0; i < input_rows_count; ++i) {
1486
14.9M
            uint32_t hash_val = 0;
1487
29.8M
            for (size_t j = 0; j < argument_size; ++j) {
1488
14.9M
                const auto& column = argument_columns[j];
1489
14.9M
                auto primitive_type = argument_primitive_types[j];
1490
14.9M
                auto val = column->get_data_at(i);
1491
14.9M
                if (val.data != nullptr) {
1492
14.9M
                    hash_val = RawValue::zlib_crc32(val.data, val.size, primitive_type, hash_val);
1493
14.9M
                } else {
1494
4.96k
                    hash_val = HashUtil::zlib_crc_hash_null(hash_val);
1495
4.96k
                }
1496
14.9M
            }
1497
14.9M
            res_data[i] = hash_val;
1498
14.9M
        }
1499
1500
15.0k
        block.replace_by_position(result, std::move(res_col));
1501
15.0k
        return Status::OK();
1502
15.0k
    }
1503
};
1504
1505
class FunctionUnicodeNormalize : public IFunction {
1506
public:
1507
    static constexpr auto name = "unicode_normalize";
1508
1509
24
    static FunctionPtr create() { return std::make_shared<FunctionUnicodeNormalize>(); }
1510
1511
5
    String get_name() const override { return name; }
1512
1513
15
    size_t get_number_of_arguments() const override { return 2; }
1514
1515
15
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
1516
15
        if (arguments.size() != 2 || !is_string_type(arguments[0]->get_primitive_type()) ||
1517
15
            !is_string_type(arguments[1]->get_primitive_type())) {
1518
0
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
1519
0
                                   "Illegal type {} and {} of arguments of function {}",
1520
0
                                   arguments[0]->get_name(), arguments[1]->get_name(), get_name());
1521
0
        }
1522
15
        return arguments[0];
1523
15
    }
1524
1525
16
    ColumnNumbers get_arguments_that_are_always_constant() const override { return {1}; }
1526
1527
32
    Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override {
1528
32
        if (scope == FunctionContext::THREAD_LOCAL) {
1529
17
            return Status::OK();
1530
17
        }
1531
1532
15
        if (!context->is_col_constant(1)) {
1533
2
            return Status::InvalidArgument(
1534
2
                    "The second argument 'mode' of function {} must be constant", get_name());
1535
2
        }
1536
1537
13
        auto* const_col = context->get_constant_col(1);
1538
13
        auto mode_ref = const_col->column_ptr->get_data_at(0);
1539
13
        std::string lower_mode = doris::to_lower(std::string(doris::trim(mode_ref.to_string())));
1540
1541
13
        UErrorCode status = U_ZERO_ERROR;
1542
13
        const icu::Normalizer2* normalizer = nullptr;
1543
1544
13
        if (lower_mode == "nfc") {
1545
5
            normalizer = icu::Normalizer2::getInstance(nullptr, "nfc", UNORM2_COMPOSE, status);
1546
8
        } else if (lower_mode == "nfd") {
1547
2
            normalizer = icu::Normalizer2::getNFDInstance(status);
1548
6
        } else if (lower_mode == "nfkc") {
1549
0
            normalizer = icu::Normalizer2::getInstance(nullptr, "nfkc", UNORM2_COMPOSE, status);
1550
6
        } else if (lower_mode == "nfkd") {
1551
2
            normalizer = icu::Normalizer2::getNFKDInstance(status);
1552
4
        } else if (lower_mode == "nfkc_cf") {
1553
2
            normalizer = icu::Normalizer2::getInstance(nullptr, "nfkc_cf", UNORM2_COMPOSE, status);
1554
2
        } else {
1555
2
            return Status::InvalidArgument(
1556
2
                    "Invalid normalization mode '{}' for function {}. "
1557
2
                    "Supported modes: NFC, NFD, NFKC, NFKD, NFKC_CF",
1558
2
                    lower_mode, get_name());
1559
2
        }
1560
1561
11
        if (U_FAILURE(status) || normalizer == nullptr) {
1562
0
            return Status::InvalidArgument(
1563
0
                    "Failed to get normalizer instance for mode '{}' in function {}: {}",
1564
0
                    lower_mode, get_name(), u_errorName(status));
1565
0
        }
1566
1567
11
        auto state = std::make_shared<UnicodeNormalizeState>();
1568
11
        state->normalizer = normalizer;
1569
11
        context->set_function_state(scope, state);
1570
11
        return Status::OK();
1571
11
    }
1572
1573
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
1574
11
                        uint32_t result, size_t input_rows_count) const override {
1575
11
        auto* state = reinterpret_cast<UnicodeNormalizeState*>(
1576
11
                context->get_function_state(FunctionContext::FRAGMENT_LOCAL));
1577
11
        if (state == nullptr || state->normalizer == nullptr) {
1578
0
            return Status::RuntimeError("unicode_normalize function state is not initialized");
1579
0
        }
1580
1581
11
        ColumnPtr col =
1582
11
                block.get_by_position(arguments[0]).column->convert_to_full_column_if_const();
1583
11
        const auto* col_str = check_and_get_column<ColumnString>(col.get());
1584
11
        if (col_str == nullptr) {
1585
0
            return Status::RuntimeError("Illegal column {} of argument of function {}",
1586
0
                                        block.get_by_position(arguments[0]).column->get_name(),
1587
0
                                        get_name());
1588
0
        }
1589
1590
11
        const auto& data = col_str->get_chars();
1591
11
        const auto& offsets = col_str->get_offsets();
1592
1593
11
        auto res = ColumnString::create();
1594
11
        auto& res_data = res->get_chars();
1595
11
        auto& res_offsets = res->get_offsets();
1596
1597
11
        size_t rows = offsets.size();
1598
11
        res_offsets.resize(rows);
1599
1600
11
        std::string tmp;
1601
22
        for (size_t i = 0; i < rows; ++i) {
1602
11
            const char* begin = reinterpret_cast<const char*>(&data[offsets[i - 1]]);
1603
11
            size_t len = offsets[i] - offsets[i - 1];
1604
1605
11
            normalize_one(state->normalizer, begin, len, tmp);
1606
11
            StringOP::push_value_string(tmp, i, res_data, res_offsets);
1607
11
        }
1608
1609
11
        block.replace_by_position(result, std::move(res));
1610
11
        return Status::OK();
1611
11
    }
1612
1613
private:
1614
    struct UnicodeNormalizeState {
1615
        const icu::Normalizer2* normalizer = nullptr;
1616
    };
1617
1618
    static void normalize_one(const icu::Normalizer2* normalizer, const char* input, size_t length,
1619
11
                              std::string& output) {
1620
11
        if (length == 0) {
1621
1
            output.clear();
1622
1
            return;
1623
1
        }
1624
1625
10
        icu::StringPiece sp(input, static_cast<int32_t>(length));
1626
10
        icu::UnicodeString src16 = icu::UnicodeString::fromUTF8(sp);
1627
1628
10
        UErrorCode status = U_ZERO_ERROR;
1629
10
        UNormalizationCheckResult quick = normalizer->quickCheck(src16, status);
1630
10
        if (U_SUCCESS(status) && quick == UNORM_YES) {
1631
4
            output.assign(input, length);
1632
4
            return;
1633
4
        }
1634
1635
6
        icu::UnicodeString result16;
1636
6
        status = U_ZERO_ERROR;
1637
6
        normalizer->normalize(src16, result16, status);
1638
6
        if (U_FAILURE(status)) {
1639
0
            output.assign(input, length);
1640
0
            return;
1641
0
        }
1642
1643
6
        output.clear();
1644
6
        result16.toUTF8String(output);
1645
6
    }
1646
};
1647
1648
// Function class built by wrapping MakeSetImpl in FunctionNeedsToHandleNull,
// parameterized with PrimitiveType::TYPE_STRING (presumably the result type -
// confirm against FunctionNeedsToHandleNull). Registered in
// register_function_string_misc().
using FunctionMakeSet = FunctionNeedsToHandleNull<MakeSetImpl, PrimitiveType::TYPE_STRING>;
1649
1650
8
// Registers the function classes listed below with `factory`, one
// register_function<T>() call per class, in the order written.
void register_function_string_misc(SimpleFunctionFactory& factory) {
1651
8
    factory.register_function<FunctionAutoPartitionName>();
1652
8
    factory.register_function<FunctionConvertTo>();
1653
8
    factory.register_function<FunctionIntToChar>();
1654
8
    factory.register_function<FunctionRandomBytes>();
1655
8
    factory.register_function<FunctionTranslate>();
1656
8
    factory.register_function<FunctionNgramSearch>();
1657
8
    factory.register_function<FunctionXPathString>();
1658
8
    factory.register_function<FunctionCrc32Internal>();
1659
8
    factory.register_function<FunctionMakeSet>();
1660
8
    factory.register_function<FunctionExportSet>();
1661
8
    factory.register_function<FunctionUnicodeNormalize>();
1662
8
}
1663
1664
#include "common/compile_check_avoid_end.h"
1665
} // namespace doris