be/src/exprs/function/function_uuid.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include <cctype> |
19 | | #include <cstddef> |
20 | | #include <cstring> |
21 | | #include <memory> |
22 | | #include <utility> |
23 | | |
24 | | #include "common/status.h" |
25 | | #include "core/assert_cast.h" |
26 | | #include "core/block/block.h" |
27 | | #include "core/block/column_numbers.h" |
28 | | #include "core/block/column_with_type_and_name.h" |
29 | | #include "core/column/column.h" |
30 | | #include "core/column/column_nullable.h" |
31 | | #include "core/column/column_string.h" |
32 | | #include "core/column/column_vector.h" |
33 | | #include "core/data_type/data_type.h" |
34 | | #include "core/data_type/data_type_nullable.h" |
35 | | #include "core/data_type/data_type_number.h" |
36 | | #include "core/data_type/data_type_string.h" |
37 | | #include "core/types.h" |
38 | | #include "exprs/aggregate/aggregate_function.h" |
39 | | #include "exprs/function/function.h" |
40 | | #include "exprs/function/simple_function_factory.h" |
41 | | |
42 | | namespace doris { |
43 | | class FunctionContext; |
44 | | } // namespace doris |
45 | | |
46 | | namespace doris { |
// End offsets of the five hex groups in the dashed 8-4-4-4-12 UUID text form.
// The first four values are also the indexes at which a '-' separator sits; the
// last one (36) is the index one past the final hex digit.
static constexpr std::array<int, 5> SPLIT_POS {8, 13, 18, 23, 36};
// Character that separates the hex groups of a dashed UUID.
static constexpr char DELIMITER {'-'};
49 | | |
50 | | class FunctionUuidtoInt : public IFunction { |
51 | | public: |
52 | | static constexpr auto name = "uuid_to_int"; |
53 | | |
54 | 25 | static FunctionPtr create() { return std::make_shared<FunctionUuidtoInt>(); } |
55 | | |
56 | 1 | String get_name() const override { return name; } |
57 | | |
58 | 16 | size_t get_number_of_arguments() const override { return 1; } |
59 | | |
60 | 16 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
61 | 16 | return make_nullable(std::make_shared<DataTypeInt128>()); |
62 | 16 | } |
63 | | |
64 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
65 | 12 | uint32_t result, size_t input_rows_count) const override { |
66 | 12 | const auto& arg_column = |
67 | 12 | assert_cast<const ColumnString&>(*block.get_by_position(arguments[0]).column); |
68 | | |
69 | 12 | auto result_column = ColumnInt128::create(input_rows_count); |
70 | 12 | auto& result_data = result_column->get_data(); |
71 | 12 | auto null_column = ColumnUInt8::create(input_rows_count); |
72 | 12 | auto& null_map = null_column->get_data(); |
73 | | |
74 | 29 | for (int row = 0; row < input_rows_count; row++) { |
75 | 17 | auto str = arg_column.get_data_at(row); |
76 | 17 | const auto* data = str.data; |
77 | 17 | Int128* result_cell = &result_data[row]; |
78 | 17 | *result_cell = 0; |
79 | 17 | null_map[row] = false; |
80 | | |
81 | 17 | if (str.size == 36) { |
82 | 12 | if (data[SPLIT_POS[0]] != DELIMITER || data[SPLIT_POS[1]] != DELIMITER || |
83 | 12 | data[SPLIT_POS[2]] != DELIMITER || data[SPLIT_POS[3]] != DELIMITER) { |
84 | 0 | null_map[row] = true; |
85 | 0 | continue; |
86 | 0 | } |
87 | 12 | char new_data[32]; |
88 | 12 | memset(new_data, 0, sizeof(new_data)); |
89 | | // ignore '-' |
90 | 12 | memcpy(new_data, data, 8); |
91 | 12 | memcpy(new_data + 8, data + SPLIT_POS[0] + 1, 4); |
92 | 12 | memcpy(new_data + 12, data + SPLIT_POS[1] + 1, 4); |
93 | 12 | memcpy(new_data + 16, data + SPLIT_POS[2] + 1, 4); |
94 | 12 | memcpy(new_data + 20, data + SPLIT_POS[3] + 1, 12); |
95 | | |
96 | 12 | if (!serialize(new_data, (char*)result_cell, 32)) { |
97 | 0 | null_map[row] = true; |
98 | 0 | continue; |
99 | 0 | } |
100 | 12 | } else if (str.size == 32) { |
101 | 2 | if (!serialize(data, (char*)result_cell, 32)) { |
102 | 0 | null_map[row] = true; |
103 | 0 | continue; |
104 | 0 | } |
105 | 3 | } else { |
106 | 3 | null_map[row] = true; |
107 | 3 | continue; |
108 | 3 | } |
109 | 17 | } |
110 | | |
111 | 12 | block.replace_by_position( |
112 | 12 | result, ColumnNullable::create(std::move(result_column), std::move(null_column))); |
113 | 12 | return Status::OK(); |
114 | 12 | } |
115 | | |
116 | | // use char* to write dst is the only legal way by 'restrict aliasing rule' |
117 | 14 | static bool serialize(const char* __restrict src, char* __restrict dst, size_t length) { |
118 | 14 | char target; // 8bit, contains 2 char input |
119 | 448 | auto translate = [&target](const char ch) { |
120 | 448 | if (isdigit(ch)) { |
121 | 294 | target += ch - '0'; |
122 | 294 | } else if (ch >= 'a' && ch <= 'f') { |
123 | 127 | target += ch - 'a' + 10; |
124 | 127 | } else if (ch >= 'A' && ch <= 'F') { |
125 | 27 | target += ch - 'A' + 10; |
126 | 27 | } else { |
127 | 0 | return false; |
128 | 0 | } |
129 | 448 | return true; |
130 | 448 | }; |
131 | | |
132 | 14 | bool ok = true; |
133 | 238 | for (size_t i = 0; i < length; i += 2, src++, dst++) { |
134 | 224 | target = 0; |
135 | 224 | if (!translate(*src)) { |
136 | 0 | ok = false; // dont break for auto-simd |
137 | 0 | } |
138 | | |
139 | 224 | src++; |
140 | 224 | target <<= 4; |
141 | 224 | if (!translate(*src)) { |
142 | 0 | ok = false; |
143 | 0 | } |
144 | 224 | *dst = target; |
145 | 224 | } |
146 | | |
147 | 14 | return ok; |
148 | 14 | } |
149 | | }; |
150 | | |
151 | | class FunctionInttoUuid : public IFunction { |
152 | | public: |
153 | | static constexpr auto name = "int_to_uuid"; |
154 | | |
155 | 17 | static FunctionPtr create() { return std::make_shared<FunctionInttoUuid>(); } |
156 | | |
157 | 1 | String get_name() const override { return name; } |
158 | | |
159 | 8 | size_t get_number_of_arguments() const override { return 1; } |
160 | | |
161 | 8 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
162 | 8 | return std::make_shared<DataTypeString>(); |
163 | 8 | } |
164 | | |
165 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
166 | 7 | uint32_t result, size_t input_rows_count) const override { |
167 | 7 | const auto& arg_column = |
168 | 7 | assert_cast<const ColumnInt128&>(*block.get_by_position(arguments[0]).column); |
169 | 7 | auto result_column = ColumnString::create(); |
170 | 7 | constexpr int str_length = 36; |
171 | 7 | auto& col_data = result_column->get_chars(); |
172 | 7 | auto& col_offset = result_column->get_offsets(); |
173 | 7 | col_data.resize(str_length * input_rows_count + |
174 | 7 | 1); // for branchless deserialize, we occupy one more byte for the last '-' |
175 | 7 | col_offset.resize(input_rows_count); |
176 | | |
177 | 17 | for (int row = 0; row < input_rows_count; row++) { |
178 | 10 | const Int128* arg = &arg_column.get_data()[row]; |
179 | 10 | col_offset[row] = col_offset[row - 1] + str_length; |
180 | 10 | deserialize((char*)arg, col_data.data() + str_length * row); |
181 | 10 | } |
182 | 7 | col_data.resize(str_length * input_rows_count); |
183 | 7 | block.replace_by_position(result, std::move(result_column)); |
184 | 7 | return Status::OK(); |
185 | 7 | } |
186 | | |
187 | | // use char* to read src is the only legal way by 'restrict aliasing rule' |
188 | 10 | static void deserialize(const char* __restrict src, unsigned char* __restrict dst) { |
189 | 320 | auto transform = [](char ch) -> unsigned char { |
190 | 320 | if (ch < 10) { |
191 | 211 | return ch + '0'; |
192 | 211 | } else { |
193 | 109 | return ch - 10 + 'a'; |
194 | 109 | } |
195 | 320 | }; |
196 | | |
197 | 10 | int j = 0; |
198 | 50 | for (int i : SPLIT_POS) { |
199 | 210 | for (; j < i; src++, j += 2) { // input 16 chars, 2 data per char |
200 | 160 | dst[j] = transform(((*src) >> 4) & 0x0F); |
201 | 160 | dst[j + 1] = transform(*src & 0x0F); |
202 | 160 | } |
203 | 50 | dst[j++] = DELIMITER; // we resized one more byte. |
204 | 50 | } |
205 | 10 | } |
206 | | }; |
207 | | |
208 | 8 | void register_function_uuid_transforms(SimpleFunctionFactory& factory) { |
209 | 8 | factory.register_function<FunctionUuidtoInt>(); |
210 | 8 | factory.register_function<FunctionInttoUuid>(); |
211 | 8 | } |
212 | | |
213 | | } // namespace doris |