be/src/exprs/function/function_json.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include <glog/logging.h> |
19 | | #include <rapidjson/allocators.h> |
20 | | #include <rapidjson/document.h> |
21 | | #include <rapidjson/rapidjson.h> |
22 | | #include <rapidjson/stringbuffer.h> |
23 | | #include <rapidjson/writer.h> |
24 | | |
25 | | #include <memory> |
26 | | #include <string_view> |
27 | | #include <utility> |
28 | | #include <vector> |
29 | | |
30 | | #include "common/cast_set.h" |
31 | | #include "common/compiler_util.h" // IWYU pragma: keep |
32 | | #include "common/status.h" |
33 | | #include "core/assert_cast.h" |
34 | | #include "core/block/block.h" |
35 | | #include "core/block/column_numbers.h" |
36 | | #include "core/block/column_with_type_and_name.h" |
37 | | #include "core/column/column.h" |
38 | | #include "core/column/column_nullable.h" |
39 | | #include "core/column/column_string.h" |
40 | | #include "core/column/column_vector.h" |
41 | | #include "core/data_type/data_type.h" |
42 | | #include "core/data_type/data_type_nullable.h" |
43 | | #include "core/data_type/data_type_number.h" |
44 | | #include "core/data_type/data_type_string.h" |
45 | | #include "core/string_ref.h" |
46 | | #include "core/types.h" |
47 | | #include "core/value/jsonb_value.h" |
48 | | #include "exprs/function/function.h" |
49 | | #include "exprs/function/simple_function_factory.h" |
50 | | |
51 | | namespace doris { |
52 | | class FunctionContext; |
53 | | } // namespace doris |
54 | | |
55 | | namespace doris { |
56 | | struct FunctionJsonQuoteImpl { |
57 | | static constexpr auto name = "json_quote"; |
58 | | |
59 | 7 | static DataTypePtr get_return_type_impl(const DataTypes& arguments) { |
60 | 7 | if (!arguments.empty() && arguments[0] && arguments[0]->is_nullable()) { |
61 | 0 | return make_nullable(std::make_shared<DataTypeString>()); |
62 | 0 | } |
63 | 7 | return std::make_shared<DataTypeString>(); |
64 | 7 | } |
65 | | static void execute(const std::vector<const ColumnString*>& data_columns, |
66 | 16 | ColumnString& result_column, size_t input_rows_count) { |
67 | 16 | rapidjson::Document document; |
68 | 16 | rapidjson::Document::AllocatorType& allocator = document.GetAllocator(); |
69 | | |
70 | 16 | rapidjson::Value value; |
71 | | |
72 | 16 | rapidjson::StringBuffer buf; |
73 | | |
74 | 42 | for (int i = 0; i < input_rows_count; i++) { |
75 | 26 | StringRef data = data_columns[0]->get_data_at(i); |
76 | 26 | value.SetString(data.data, cast_set<rapidjson::SizeType>(data.size), allocator); |
77 | | |
78 | 26 | buf.Clear(); |
79 | 26 | rapidjson::Writer<rapidjson::StringBuffer> writer(buf); |
80 | 26 | value.Accept(writer); |
81 | 26 | result_column.insert_data(buf.GetString(), buf.GetSize()); |
82 | 26 | } |
83 | 16 | } |
84 | | }; |
85 | | |
86 | | template <typename Impl> |
87 | | class FunctionJson : public IFunction { |
88 | | public: |
89 | | static constexpr auto name = Impl::name; |
90 | | |
91 | 13 | static FunctionPtr create() { return std::make_shared<FunctionJson<Impl>>(); } |
92 | | |
93 | 0 | String get_name() const override { return name; } |
94 | | |
95 | 0 | size_t get_number_of_arguments() const override { return 0; } |
96 | | |
97 | 8 | bool is_variadic() const override { return true; } |
98 | | |
99 | 7 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
100 | 7 | return Impl::get_return_type_impl(arguments); |
101 | 7 | } |
102 | | |
103 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
104 | 16 | uint32_t result, size_t input_rows_count) const override { |
105 | 16 | auto result_column = ColumnString::create(); |
106 | | |
107 | 16 | std::vector<ColumnPtr> column_ptrs; // prevent converted column destruct |
108 | 16 | std::vector<const ColumnString*> data_columns; |
109 | 16 | for (unsigned int argument : arguments) { |
110 | 16 | column_ptrs.push_back( |
111 | 16 | block.get_by_position(argument).column->convert_to_full_column_if_const()); |
112 | 16 | data_columns.push_back(assert_cast<const ColumnString*>(column_ptrs.back().get())); |
113 | 16 | } |
114 | | |
115 | 16 | Impl::execute(data_columns, *result_column.get(), input_rows_count); |
116 | 16 | block.get_by_position(result).column = std::move(result_column); |
117 | 16 | return Status::OK(); |
118 | 16 | } |
119 | | }; |
120 | | |
121 | | class FunctionJsonValid : public IFunction { |
122 | | public: |
123 | | static constexpr auto name = "json_valid"; |
124 | 22 | static FunctionPtr create() { return std::make_shared<FunctionJsonValid>(); } |
125 | | |
126 | 1 | String get_name() const override { return name; } |
127 | | |
128 | 16 | size_t get_number_of_arguments() const override { return 1; } |
129 | | |
130 | 16 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
131 | 16 | return make_nullable(std::make_shared<DataTypeInt32>()); |
132 | 16 | } |
133 | | |
134 | 68 | bool use_default_implementation_for_nulls() const override { return false; } |
135 | | |
136 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
137 | 52 | uint32_t result, size_t input_rows_count) const override { |
138 | 52 | const IColumn& col_from = *(block.get_by_position(arguments[0]).column); |
139 | | |
140 | 52 | auto null_map = ColumnUInt8::create(input_rows_count, 0); |
141 | | |
142 | 52 | const ColumnUInt8::Container* input_null_map = nullptr; |
143 | 52 | const ColumnString* col_from_string = nullptr; |
144 | 52 | if (const auto* nullable = check_and_get_column<ColumnNullable>(col_from)) { |
145 | 34 | input_null_map = &nullable->get_null_map_data(); |
146 | 34 | col_from_string = |
147 | 34 | check_and_get_column<ColumnString>(*nullable->get_nested_column_ptr()); |
148 | 34 | } else { |
149 | 18 | col_from_string = check_and_get_column<ColumnString>(col_from); |
150 | 18 | } |
151 | | |
152 | 52 | if (!col_from_string) { |
153 | 0 | return Status::RuntimeError("Illegal column {} should be ColumnString", |
154 | 0 | col_from.get_name()); |
155 | 0 | } |
156 | | |
157 | 52 | auto col_to = ColumnInt32::create(); |
158 | 52 | auto& vec_to = col_to->get_data(); |
159 | 52 | size_t size = col_from.size(); |
160 | 52 | vec_to.resize(size); |
161 | | |
162 | | // parser can be reused for performance |
163 | | |
164 | 52 | auto input_type = block.get_by_position(arguments[0]).type->get_primitive_type(); |
165 | | |
166 | 52 | if (input_type == PrimitiveType::TYPE_VARCHAR || input_type == PrimitiveType::TYPE_CHAR || |
167 | 52 | input_type == PrimitiveType::TYPE_STRING) { |
168 | 12 | JsonBinaryValue jsonb_value; |
169 | 24 | for (size_t i = 0; i < input_rows_count; ++i) { |
170 | 12 | if (input_null_map && (*input_null_map)[i]) { |
171 | 4 | null_map->get_data()[i] = 1; |
172 | 4 | vec_to[i] = 0; |
173 | 4 | continue; |
174 | 4 | } |
175 | | |
176 | 8 | const auto& val = col_from_string->get_data_at(i); |
177 | 8 | if (jsonb_value.from_json_string(val.data, cast_set<unsigned int>(val.size)).ok()) { |
178 | 4 | vec_to[i] = 1; |
179 | 4 | } else { |
180 | 4 | vec_to[i] = 0; |
181 | 4 | } |
182 | 8 | } |
183 | | |
184 | 40 | } else { |
185 | 40 | DCHECK(input_type == PrimitiveType::TYPE_JSONB); |
186 | 143 | for (size_t i = 0; i < input_rows_count; ++i) { |
187 | 103 | if (input_null_map && (*input_null_map)[i]) { |
188 | 6 | null_map->get_data()[i] = 1; |
189 | 6 | vec_to[i] = 0; |
190 | 6 | continue; |
191 | 6 | } |
192 | 97 | const auto& val = col_from_string->get_data_at(i); |
193 | 97 | if (val.size == 0) { |
194 | 0 | vec_to[i] = 0; |
195 | 0 | continue; |
196 | 0 | } |
197 | 97 | const JsonbDocument* doc = nullptr; |
198 | 97 | auto st = JsonbDocument::checkAndCreateDocument(val.data, val.size, &doc); |
199 | 97 | if (!st.ok() || !doc || !doc->getValue()) [[unlikely]] { |
200 | 0 | vec_to[i] = 0; |
201 | 0 | continue; |
202 | 0 | } |
203 | 97 | const JsonbValue* value = doc->getValue(); |
204 | 97 | if (UNLIKELY(!value)) { |
205 | 0 | vec_to[i] = 0; |
206 | 0 | continue; |
207 | 0 | } |
208 | 97 | vec_to[i] = 1; |
209 | 97 | } |
210 | 40 | } |
211 | | |
212 | 52 | block.replace_by_position(result, |
213 | 52 | ColumnNullable::create(std::move(col_to), std::move(null_map))); |
214 | | |
215 | 52 | return Status::OK(); |
216 | 52 | } |
217 | | }; |
218 | | class FunctionJsonUnquote : public IFunction { |
219 | | public: |
220 | | static constexpr auto name = "json_unquote"; |
221 | 18 | static FunctionPtr create() { return std::make_shared<FunctionJsonUnquote>(); } |
222 | | |
223 | 1 | String get_name() const override { return name; } |
224 | | |
225 | 12 | size_t get_number_of_arguments() const override { return 1; } |
226 | | |
227 | 12 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
228 | 12 | return make_nullable(std::make_shared<DataTypeString>()); |
229 | 12 | } |
230 | | |
231 | 24 | bool use_default_implementation_for_nulls() const override { return false; } |
232 | | |
233 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
234 | 12 | uint32_t result, size_t input_rows_count) const override { |
235 | 12 | const IColumn& col_from = *(block.get_by_position(arguments[0]).column); |
236 | | |
237 | 12 | auto null_map = ColumnUInt8::create(input_rows_count, 0); |
238 | | |
239 | 12 | const auto* col_from_string = check_and_get_column<ColumnString>(col_from); |
240 | 12 | if (const auto* nullable = check_and_get_column<ColumnNullable>(col_from)) { |
241 | 4 | col_from_string = |
242 | 4 | check_and_get_column<ColumnString>(*nullable->get_nested_column_ptr()); |
243 | 4 | } |
244 | | |
245 | 12 | if (!col_from_string) { |
246 | 0 | return Status::RuntimeError("Illegal column {} should be ColumnString", |
247 | 0 | col_from.get_name()); |
248 | 0 | } |
249 | | |
250 | 12 | auto col_to = ColumnString::create(); |
251 | 12 | col_to->reserve(input_rows_count); |
252 | | |
253 | | // parser can be reused for performance |
254 | 12 | rapidjson::Document document; |
255 | 24 | for (size_t i = 0; i < input_rows_count; ++i) { |
256 | 12 | if (col_from.is_null_at(i)) { |
257 | 3 | null_map->get_data()[i] = 1; |
258 | 3 | col_to->insert_data(nullptr, 0); |
259 | 3 | continue; |
260 | 3 | } |
261 | | |
262 | 9 | const auto& json_str = col_from_string->get_data_at(i); |
263 | 9 | if (json_str.size < 2 || json_str.data[0] != '"' || |
264 | 9 | json_str.data[json_str.size - 1] != '"') { |
265 | | // non-quoted string |
266 | 6 | col_to->insert_data(json_str.data, json_str.size); |
267 | 6 | } else { |
268 | 3 | document.Parse(json_str.data, json_str.size); |
269 | 3 | if (document.HasParseError() || !document.IsString()) { |
270 | 0 | return Status::RuntimeError( |
271 | 0 | fmt::format("Invalid JSON text in argument 1 to function {}: {}", name, |
272 | 0 | std::string_view(json_str.data, json_str.size))); |
273 | 0 | } |
274 | 3 | col_to->insert_data(document.GetString(), document.GetStringLength()); |
275 | 3 | } |
276 | 9 | } |
277 | | |
278 | 12 | block.replace_by_position(result, |
279 | 12 | ColumnNullable::create(std::move(col_to), std::move(null_map))); |
280 | | |
281 | 12 | return Status::OK(); |
282 | 12 | } |
283 | | }; |
284 | | |
285 | 5 | void register_function_json(SimpleFunctionFactory& factory) { |
286 | 5 | factory.register_function<FunctionJsonUnquote>(); |
287 | | |
288 | 5 | factory.register_function<FunctionJson<FunctionJsonQuoteImpl>>(); |
289 | | |
290 | 5 | factory.register_function<FunctionJsonValid>(); |
291 | 5 | } |
292 | | |
293 | | } // namespace doris |