be/src/exprs/function/cast/cast_base.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include "exprs/function/cast/cast_base.h" |
19 | | |
20 | | #include <cstdint> |
21 | | |
22 | | #include "util/jsonb_writer.h" |
23 | | namespace doris::CastWrapper { |
24 | | |
25 | | Status cast_from_generic_to_jsonb(FunctionContext* context, Block& block, |
26 | | const ColumnNumbers& arguments, uint32_t result, |
27 | 32 | size_t input_rows_count, const NullMap::value_type* null_map) { |
28 | 32 | auto data_type_to = block.get_by_position(result).type; |
29 | 32 | const auto& col_with_type_and_name = block.get_by_position(arguments[0]); |
30 | 32 | const IDataType& type = *col_with_type_and_name.type; |
31 | 32 | const IColumn& col_from = *col_with_type_and_name.column; |
32 | | |
33 | 32 | auto column_string = ColumnString::create(); |
34 | 32 | JsonbWriter writer; |
35 | | |
36 | 32 | ColumnUInt8::MutablePtr col_null_map_to = ColumnUInt8::create(col_from.size(), 0); |
37 | 32 | ColumnUInt8::Container* vec_null_map_to = &col_null_map_to->get_data(); |
38 | 32 | DataTypeSerDe::FormatOptions format_options; |
39 | 32 | format_options.converted_from_string = true; |
40 | 32 | DataTypeSerDeSPtr from_serde = type.get_serde(); |
41 | 32 | DataTypeSerDeSPtr to_serde = data_type_to->get_serde(); |
42 | 32 | auto col_to = data_type_to->create_column(); |
43 | | |
44 | 32 | auto tmp_col = ColumnString::create(); |
45 | 32 | DataTypeSerDe::FormatOptions options; |
46 | 32 | auto time_zone = cctz::utc_time_zone(); |
47 | 32 | options.timezone = |
48 | 32 | (context && context->state()) ? &context->state()->timezone_obj() : &time_zone; |
49 | | |
50 | 32 | options.escape_char = '\\'; |
51 | 16.4k | for (size_t i = 0; i < input_rows_count; i++) { |
52 | | // convert to string |
53 | 16.4k | tmp_col->clear(); |
54 | 16.4k | VectorBufferWriter write_buffer(*tmp_col.get()); |
55 | 16.4k | Status st = from_serde->serialize_column_to_json(col_from, i, i + 1, write_buffer, options); |
56 | | // if serialized failed, will return null |
57 | 16.4k | (*vec_null_map_to)[i] = !st.ok(); |
58 | 16.4k | if (!st.ok()) { |
59 | 0 | col_to->insert_default(); |
60 | 0 | continue; |
61 | 0 | } |
62 | 16.4k | write_buffer.commit(); |
63 | 16.4k | writer.reset(); |
64 | 16.4k | auto str_ref = tmp_col->get_data_at(0); |
65 | 16.4k | Slice data((char*)(str_ref.data), str_ref.size); |
66 | | // first try to parse string |
67 | 16.4k | st = to_serde->deserialize_one_cell_from_json(*col_to, data, format_options); |
68 | | // if parsing failed, will return null |
69 | 16.4k | (*vec_null_map_to)[i] = !st.ok(); |
70 | 16.4k | if (!st.ok()) { |
71 | 0 | col_to->insert_default(); |
72 | 0 | } |
73 | 16.4k | } |
74 | | |
75 | 32 | block.replace_by_position( |
76 | 32 | result, ColumnNullable::create(std::move(col_to), std::move(col_null_map_to))); |
77 | 32 | return Status::OK(); |
78 | 32 | } |
79 | | |
80 | | Status cast_from_string_to_generic(FunctionContext* context, Block& block, |
81 | | const ColumnNumbers& arguments, uint32_t result, |
82 | 11 | size_t input_rows_count, const NullMap::value_type* null_map) { |
83 | 11 | const auto& col_with_type_and_name = block.get_by_position(arguments[0]); |
84 | 11 | const IColumn& col_from = *col_with_type_and_name.column; |
85 | | // result column must set type |
86 | 11 | DCHECK(block.get_by_position(result).type != nullptr); |
87 | 11 | auto data_type_to = block.get_by_position(result).type; |
88 | 11 | if (const auto* col_from_string = check_and_get_column<ColumnString>(&col_from)) { |
89 | 11 | auto col_to = data_type_to->create_column(); |
90 | 11 | auto serde = data_type_to->get_serde(); |
91 | 11 | size_t size = col_from.size(); |
92 | 11 | col_to->reserve(size); |
93 | | |
94 | 11 | ColumnUInt8::MutablePtr col_null_map_to = ColumnUInt8::create(size, 0); |
95 | 11 | ColumnUInt8::Container* vec_null_map_to = &col_null_map_to->get_data(); |
96 | 11 | const bool is_complex = is_complex_type(data_type_to->get_primitive_type()); |
97 | 11 | DataTypeSerDe::FormatOptions format_options; |
98 | 11 | format_options.converted_from_string = true; |
99 | 11 | format_options.escape_char = '\\'; |
100 | | |
101 | 65 | for (size_t i = 0; i < size; ++i) { |
102 | 54 | const auto& val = col_from_string->get_data_at(i); |
103 | | // Note: here we should handle the null element |
104 | 54 | if (val.size == 0) { |
105 | 2 | col_to->insert_default(); |
106 | | // empty string('') is an invalid format for complex type, set null_map to 1 |
107 | 2 | if (is_complex) { |
108 | 0 | (*vec_null_map_to)[i] = 1; |
109 | 0 | } |
110 | 2 | continue; |
111 | 2 | } |
112 | 52 | Slice string_slice(val.data, val.size); |
113 | 52 | Status st = |
114 | 52 | serde->deserialize_one_cell_from_json(*col_to, string_slice, format_options); |
115 | | // if parsing failed, will return null |
116 | 52 | (*vec_null_map_to)[i] = !st.ok(); |
117 | 52 | if (!st.ok()) { |
118 | 0 | col_to->insert_default(); |
119 | 0 | } |
120 | 52 | } |
121 | 11 | block.get_by_position(result).column = |
122 | 11 | ColumnNullable::create(std::move(col_to), std::move(col_null_map_to)); |
123 | 11 | } else { |
124 | 0 | return Status::RuntimeError( |
125 | 0 | "Illegal column {} of first argument of conversion function from string", |
126 | 0 | col_from.get_name()); |
127 | 0 | } |
128 | 11 | return Status::OK(); |
129 | 11 | } |
130 | | |
131 | | ElementWrappers get_element_wrappers(FunctionContext* context, const DataTypes& from_element_types, |
132 | 4.80k | const DataTypes& to_element_types) { |
133 | 4.80k | DCHECK(from_element_types.size() == to_element_types.size()); |
134 | 4.80k | ElementWrappers element_wrappers; |
135 | 4.80k | element_wrappers.reserve(from_element_types.size()); |
136 | 18.2k | for (size_t i = 0; i < from_element_types.size(); ++i) { |
137 | 13.4k | const DataTypePtr& from_element_type = from_element_types[i]; |
138 | 13.4k | const DataTypePtr& to_element_type = to_element_types[i]; |
139 | 13.4k | element_wrappers.push_back( |
140 | 13.4k | prepare_unpack_dictionaries(context, from_element_type, to_element_type)); |
141 | 13.4k | } |
142 | 4.80k | return element_wrappers; |
143 | 4.80k | } |
144 | | |
145 | 84 | WrapperType create_unsupport_wrapper(const String error_msg) { |
146 | 84 | return [error_msg](FunctionContext* /*context*/, Block& /*block*/, |
147 | 84 | const ColumnNumbers& /*arguments*/, uint32_t /*result*/, |
148 | 84 | size_t /*input_rows_count*/, const NullMap::value_type* null_map = nullptr) { |
149 | 84 | return Status::InvalidArgument(error_msg); |
150 | 84 | }; |
151 | 84 | } |
152 | | |
153 | 0 | WrapperType create_unsupport_wrapper(const String from_type_name, const String to_type_name) { |
154 | 0 | const String error_msg = |
155 | 0 | fmt::format("Conversion from {} to {} is not supported", from_type_name, to_type_name); |
156 | 0 | return create_unsupport_wrapper(error_msg); |
157 | 0 | } |
158 | | |
159 | 75.2k | WrapperType create_identity_wrapper(const DataTypePtr&) { |
160 | 75.2k | return [](FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
161 | 75.2k | uint32_t result, size_t /*input_rows_count*/, |
162 | 75.2k | const NullMap::value_type* null_map = nullptr) { |
163 | 75.2k | block.get_by_position(result).column = block.get_by_position(arguments.front()).column; |
164 | 75.2k | return Status::OK(); |
165 | 75.2k | }; |
166 | 75.2k | } |
167 | | |
168 | | /// the only difference between these two functions is throw error or not when parsing fail. |
169 | | /// the return columns are both nullable columns. |
170 | | Status cast_from_string_to_complex_type(FunctionContext* context, Block& block, |
171 | | const ColumnNumbers& arguments, uint32_t result, |
172 | | size_t input_rows_count, |
173 | 4.25k | const NullMap::value_type* null_map) { |
174 | 4.25k | const auto* col_from = check_and_get_column<DataTypeString::ColumnType>( |
175 | 4.25k | block.get_by_position(arguments[0]).column.get()); |
176 | | |
177 | 4.25k | auto to_type = block.get_by_position(result).type; |
178 | 4.25k | auto to_serde = remove_nullable(to_type)->get_serde(); |
179 | | |
180 | | // string to complex type is always nullable |
181 | 4.25k | MutableColumnPtr to_column = make_nullable(to_type)->create_column(); |
182 | 4.25k | auto& nullable_col_to = assert_cast<ColumnNullable&>(*to_column); |
183 | 4.25k | auto& nested_column = nullable_col_to.get_nested_column(); |
184 | | |
185 | 4.25k | DataTypeSerDe::FormatOptions options; |
186 | 4.25k | options.converted_from_string = true; |
187 | 4.25k | options.escape_char = '\\'; |
188 | 4.25k | options.timezone = &context->state()->timezone_obj(); |
189 | | |
190 | 1.32M | for (size_t i = 0; i < input_rows_count; ++i) { |
191 | 1.32M | if (null_map && null_map[i]) { |
192 | 26.3k | nullable_col_to.insert_default(); |
193 | 1.29M | } else { |
194 | 1.29M | auto str = col_from->get_data_at(i); |
195 | 1.29M | Status st = to_serde->from_string(str, nested_column, options); |
196 | 1.29M | if (st.ok()) { |
197 | 815k | nullable_col_to.get_null_map_data().push_back(0); |
198 | 815k | } else { |
199 | 483k | nullable_col_to.insert_default(); // fill null if fail |
200 | 483k | } |
201 | 1.29M | } |
202 | 1.32M | } |
203 | | |
204 | 4.25k | block.get_by_position(result).column = std::move(to_column); |
205 | 4.25k | return Status::OK(); |
206 | 4.25k | } |
207 | | |
208 | | Status cast_from_string_to_complex_type_strict_mode(FunctionContext* context, Block& block, |
209 | | const ColumnNumbers& arguments, uint32_t result, |
210 | | size_t input_rows_count, |
211 | 7.83k | const NullMap::value_type* null_map) { |
212 | 7.83k | const auto* col_from = check_and_get_column<DataTypeString::ColumnType>( |
213 | 7.83k | block.get_by_position(arguments[0]).column.get()); |
214 | | |
215 | 7.83k | auto to_type = block.get_by_position(result).type; |
216 | 7.83k | auto to_serde = remove_nullable(to_type)->get_serde(); |
217 | | |
218 | | // string to complex type is always nullable |
219 | 7.83k | MutableColumnPtr to_column = make_nullable(to_type)->create_column(); |
220 | 7.83k | auto& nullable_col_to = assert_cast<ColumnNullable&>(*to_column); |
221 | 7.83k | auto& nested_column = nullable_col_to.get_nested_column(); |
222 | | |
223 | 7.83k | DataTypeSerDe::FormatOptions options; |
224 | 7.83k | options.converted_from_string = true; |
225 | 7.83k | options.escape_char = '\\'; |
226 | 7.83k | options.timezone = &context->state()->timezone_obj(); |
227 | | |
228 | 15.6k | for (size_t i = 0; i < input_rows_count; ++i) { |
229 | 7.83k | if (null_map && null_map[i]) { |
230 | 0 | to_column->insert_default(); |
231 | 7.83k | } else { |
232 | 7.83k | auto str = col_from->get_data_at(i); |
233 | 7.83k | RETURN_IF_ERROR(to_serde->from_string_strict_mode(str, nested_column, options)); |
234 | | // fill not null if success |
235 | 7.81k | nullable_col_to.get_null_map_data().push_back(0); |
236 | 7.81k | } |
237 | 7.83k | } |
238 | 7.81k | block.get_by_position(result).column = std::move(to_column); |
239 | 7.81k | return Status::OK(); |
240 | 7.83k | } |
241 | | |
242 | | } // namespace doris::CastWrapper |