Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2014, Facebook, Inc. |
3 | | * All rights reserved. |
4 | | * |
5 | | * This source code is licensed under the BSD-style license found in the |
6 | | * LICENSE file in the root directory of this source tree. An additional grant |
7 | | * of patent rights can be found in the PATENTS file in the same directory. |
8 | | * |
9 | | */ |
10 | | |
11 | | /* |
12 | | * This file defines JsonbParserTSIMD (template) and JsonbParser. |
13 | | * |
14 | | * JsonbParserTSIMD is a template class which implements a JSON parser. |
15 | | * JsonbParserTSIMD parses JSON text, and serializes it to JSONB binary format |
16 | | * by using JsonbWriterT object. By default, JsonbParserTSIMD creates a new |
17 | | * JsonbWriterT object with an output stream object. However, you can also |
18 | | * pass in your JsonbWriterT or any stream object that implements some basic |
19 | | * interface of std::ostream (see JsonbStream.h). |
20 | | * |
21 | | * JsonbParser specializes JsonbParserTSIMD with JsonbOutStream type (see |
22 | | * JsonbStream.h). So unless you want to provide your own output stream |
23 | | * type, use a JsonbParser object. |
24 | | * |
25 | | * ** Parsing JSON ** |
26 | | * JsonbParserTSIMD parses JSON string, and directly serializes into JSONB |
27 | | * packed bytes. There are three ways to parse a JSON string: (1) using |
28 | | * c-string, (2) using string with len, (3) using std::istream object. You can |
29 | | * use custom streambuf to redirect output. JsonbOutBuffer is a streambuf used |
30 | | * internally if the input is raw character buffer. |
31 | | * |
32 | | * You can reuse a JsonbParserTSIMD object to parse/serialize multiple JSON |
33 | | * strings, and the previous JSONB will be overwritten. |
34 | | * |
35 | | * If parsing fails (returned false), the error code will be set to one of |
36 | | * JsonbErrType, and can be retrieved by calling getErrorCode(). |
37 | | * |
38 | | * ** External dictionary ** |
39 | | * During parsing a JSON string, you can pass a call-back function to map a key |
40 | | * string to an id, and store the dictionary id in JSONB to save space. The |
41 | | * purpose of using an external dictionary is more towards a collection of |
42 | | * documents (which has common keys) rather than a single document, so that |
43 | | * space saving will be significant. |
44 | | * |
45 | | * ** Endianness ** |
46 | | * Note: JSONB serialization doesn't assume endianness of the server. However |
47 | | * you will need to ensure that the endianness at the reader side is the same |
48 | | * as that at the writer side (if they are on different machines). Otherwise, |
49 | | * proper conversion is needed when a number value is returned to the |
50 | | * caller/writer. |
51 | | * |
52 | | * @author Tian Xia <tianx@fb.com> |
53 | | * |
54 | | * this file is copied from |
55 | | * https://github.com/facebook/mysql-5.6/blob/fb-mysql-5.6.35/fbson/FbsonJsonParser.h |
56 | | * and modified by Doris |
57 | | */ |
58 | | |
59 | | #pragma once |
60 | | #include <simdjson.h> |
61 | | |
62 | | #include <cmath> |
63 | | #include <limits> |
64 | | |
65 | | #include "common/status.h" |
66 | | #include "util/jsonb_document.h" |
67 | | #include "util/jsonb_writer.h" |
68 | | #include "util/string_parser.hpp" |
69 | | |
70 | | namespace doris { |
71 | | using int128_t = __int128; |
72 | | struct JsonbParser { |
73 | | // parse a UTF-8 JSON string with length |
74 | | // will reset writer before parse |
75 | 2.69k | static Status parse(const char* pch, size_t len, JsonbWriter& writer) { |
76 | 2.69k | if (!pch || len == 0) { |
77 | 11 | return Status::InvalidArgument("Empty JSON document"); |
78 | 11 | } |
79 | 2.68k | writer.reset(); |
80 | 2.68k | try { |
81 | 2.68k | simdjson::ondemand::parser simdjson_parser; |
82 | 2.68k | simdjson::padded_string json_str {pch, len}; |
83 | 2.68k | simdjson::ondemand::document doc = simdjson_parser.iterate(json_str); |
84 | | |
85 | | // simdjson process top level primitive types specially |
86 | | // so some repeated code here |
87 | 2.68k | bool need_check_at_end = true; |
88 | 2.68k | switch (doc.type()) { |
89 | 417 | case simdjson::ondemand::json_type::object: |
90 | 1.97k | case simdjson::ondemand::json_type::array: { |
91 | 1.97k | RETURN_IF_ERROR(parse(doc.get_value(), writer)); |
92 | 1.70k | break; |
93 | 1.97k | } |
94 | 1.70k | case simdjson::ondemand::json_type::null: { |
95 | 57 | bool is_null = false; |
96 | 57 | simdjson::error_code res = doc.is_null().get(is_null); |
97 | 57 | if (res != simdjson::SUCCESS || !is_null) { |
98 | 6 | return Status::InvalidArgument(fmt::format("simdjson get null failed: {}", |
99 | 6 | simdjson::error_message(res))); |
100 | 6 | } |
101 | 51 | if (writer.writeNull() == 0) { |
102 | 0 | return Status::InvalidArgument("writeNull failed"); |
103 | 0 | } |
104 | 51 | break; |
105 | 51 | } |
106 | 72 | case simdjson::ondemand::json_type::boolean: { |
107 | 72 | if (writer.writeBool(doc.get_bool()) == 0) { |
108 | 0 | return Status::InvalidArgument("writeBool failed"); |
109 | 0 | } |
110 | 72 | break; |
111 | 72 | } |
112 | 300 | case simdjson::ondemand::json_type::string: { |
113 | 300 | RETURN_IF_ERROR(write_string(doc.get_string(), writer)); |
114 | 300 | break; |
115 | 300 | } |
116 | 300 | case simdjson::ondemand::json_type::number: { |
117 | 264 | simdjson::ondemand::number num; |
118 | 264 | simdjson::error_code res = doc.get_number().get(num); |
119 | 264 | if (res == simdjson::error_code::SUCCESS) { |
120 | 248 | RETURN_IF_ERROR( |
121 | 248 | write_number(num, doc.get_number_type(), doc.raw_json_token(), writer)); |
122 | 248 | break; |
123 | 248 | } |
124 | 16 | if (res == simdjson::error_code::NUMBER_ERROR || |
125 | 16 | res == simdjson::error_code::BIGINT_ERROR) { |
126 | 16 | RETURN_IF_ERROR(write_number_from_raw_json(pch, len, writer)); |
127 | 7 | need_check_at_end = false; |
128 | 7 | break; |
129 | 16 | } |
130 | 0 | return Status::InvalidArgument(fmt::format("simdjson get_number failed: {}", |
131 | 0 | simdjson::error_message(res))); |
132 | 16 | } |
133 | 2.68k | } |
134 | 2.36k | if (need_check_at_end && !doc.at_end()) { |
135 | 0 | return Status::InvalidArgument("simdjson parse exception: trailing content"); |
136 | 0 | } |
137 | 2.36k | } catch (simdjson::simdjson_error& e) { |
138 | 30 | return Status::InvalidArgument(fmt::format("simdjson parse exception: {}", e.what())); |
139 | 30 | } |
140 | 2.36k | return Status::OK(); |
141 | 2.68k | } |
142 | | |
143 | | private: |
144 | | // parse json, recursively if necessary, by simdjson |
145 | | // and serialize to binary format by writer |
146 | 561k | static Status parse(simdjson::ondemand::value value, JsonbWriter& writer) { |
147 | 561k | switch (value.type()) { |
148 | 11.7k | case simdjson::ondemand::json_type::null: { |
149 | 11.7k | if (writer.writeNull() == 0) { |
150 | 0 | return Status::InvalidArgument("writeNull failed"); |
151 | 0 | } |
152 | 11.7k | break; |
153 | 11.7k | } |
154 | 20.5k | case simdjson::ondemand::json_type::boolean: { |
155 | 20.5k | if (writer.writeBool(value.get_bool()) == 0) { |
156 | 0 | return Status::InvalidArgument("writeBool failed"); |
157 | 0 | } |
158 | 20.5k | break; |
159 | 20.5k | } |
160 | 250k | case simdjson::ondemand::json_type::string: { |
161 | 250k | RETURN_IF_ERROR(write_string(value.get_string(), writer)); |
162 | 250k | break; |
163 | 250k | } |
164 | 250k | case simdjson::ondemand::json_type::number: { |
165 | 224k | simdjson::ondemand::number num; |
166 | 224k | auto res = value.get_number().get(num); |
167 | 224k | if (res == simdjson::error_code::SUCCESS) { |
168 | 224k | RETURN_IF_ERROR( |
169 | 224k | write_number(num, value.get_number_type(), value.raw_json_token(), writer)); |
170 | 224k | break; |
171 | 224k | } |
172 | 4 | if (res == simdjson::error_code::NUMBER_ERROR || |
173 | 4 | res == simdjson::error_code::BIGINT_ERROR) { |
174 | 4 | RETURN_IF_ERROR(write_number_from_raw_json(value.raw_json_token(), writer)); |
175 | 1 | break; |
176 | 4 | } |
177 | 0 | return Status::InvalidArgument( |
178 | 0 | fmt::format("simdjson get_number failed: {}", simdjson::error_message(res))); |
179 | 4 | } |
180 | 2.67k | case simdjson::ondemand::json_type::object: { |
181 | 2.67k | if (!writer.writeStartObject()) { |
182 | 0 | return Status::InvalidArgument("writeStartObject failed"); |
183 | 0 | } |
184 | | |
185 | 6.20k | for (auto kv : value.get_object()) { |
186 | 6.20k | std::string_view key; |
187 | 6.20k | simdjson::error_code e = kv.unescaped_key().get(key); |
188 | 6.20k | if (e != simdjson::SUCCESS) { |
189 | 2 | return Status::InvalidArgument(fmt::format("simdjson get key failed: {}", e)); |
190 | 2 | } |
191 | | |
192 | | // write key |
193 | 6.19k | if (key.size() > std::numeric_limits<uint8_t>::max()) { |
194 | 268 | return Status::InvalidArgument("key size exceeds max limit: {} , {}", |
195 | 268 | key.size(), std::numeric_limits<uint8_t>::max()); |
196 | 268 | } |
197 | 5.93k | if (!writer.writeKey(key.data(), (uint8_t)key.size())) { |
198 | 0 | return Status::InvalidArgument("writeKey failed : {}", key); |
199 | 0 | } |
200 | | |
201 | | // parse object value |
202 | 5.93k | RETURN_IF_ERROR(parse(kv.value(), writer)); |
203 | 5.93k | } |
204 | | |
205 | 2.31k | if (!writer.writeEndObject()) { |
206 | 0 | return Status::InvalidArgument("writeEndObject failed"); |
207 | 0 | break; |
208 | 0 | } |
209 | | |
210 | 2.31k | break; |
211 | 2.31k | } |
212 | 51.8k | case simdjson::ondemand::json_type::array: { |
213 | 51.8k | if (!writer.writeStartArray()) { |
214 | 0 | return Status::InvalidArgument("writeStartArray failed"); |
215 | 0 | } |
216 | | |
217 | 553k | for (auto elem : value.get_array()) { |
218 | | // parse array element |
219 | 553k | RETURN_IF_ERROR(parse(elem.value(), writer)); |
220 | 553k | } |
221 | | |
222 | 51.6k | if (!writer.writeEndArray()) { |
223 | 0 | return Status::InvalidArgument("writeEndArray failed"); |
224 | 0 | } |
225 | 51.6k | break; |
226 | 51.6k | } |
227 | 51.6k | default: { |
228 | 0 | return Status::InvalidArgument("unknown value type: "); |
229 | 51.6k | } |
230 | | |
231 | 561k | } // end of switch |
232 | 560k | return Status::OK(); |
233 | 561k | } |
234 | | |
235 | 250k | static Status write_string(std::string_view str, JsonbWriter& writer) { |
236 | | // start writing string |
237 | 250k | if (!writer.writeStartString()) { |
238 | 0 | return Status::InvalidArgument("writeStartString failed"); |
239 | 0 | } |
240 | | |
241 | | // write string |
242 | 250k | if (str.size() > 0) { |
243 | 239k | if (writer.writeString(str.data(), str.size()) == 0) { |
244 | 0 | return Status::InvalidArgument("writeString failed"); |
245 | 0 | } |
246 | 239k | } |
247 | | |
248 | | // end writing string |
249 | 250k | if (!writer.writeEndString()) { |
250 | 0 | return Status::InvalidArgument("writeEndString failed"); |
251 | 0 | } |
252 | 250k | return Status::OK(); |
253 | 250k | } |
254 | | |
255 | | static Status write_number(simdjson::ondemand::number num, |
256 | | simdjson ::ondemand::number_type num_type, |
257 | 224k | std::string_view raw_string, JsonbWriter& writer) { |
258 | | // The simdjson library supports four types of numbers: |
259 | | // 1. floating_point_number: A binary64 number, which will be converted to jsonb's double type. |
260 | | // 2. signed_integer: A signed integer that fits in a 64-bit word using two's complement. |
261 | | // 3. unsigned_integer: A positive integer larger or equal to 1<<63. |
262 | | // For these two integer types, we will convert them to jsonb's int8/int16/int32/int64/int128 types according to the specific value. |
263 | | // 4. big_integer: An integer that does not fit in a 64-bit word. |
264 | | // For this type, simdjson cannot handle it directly. We first try to convert it to jsonb's int128 type. |
265 | | // If conversion fails, we attempt to convert it to a double type. |
266 | | // If conversion to double also fails, an error is returned. |
267 | | |
268 | 224k | switch (num_type) { |
269 | 115k | case simdjson::ondemand::number_type::floating_point_number: { |
270 | 115k | double number = num.get_double(); |
271 | | // When a double exceeds the precision that can be represented by a double type in simdjson, it gets converted to 0. |
272 | | // The correct approach, should be to truncate the double value instead. |
273 | 115k | if (number == 0) { |
274 | 13.7k | StringParser::ParseResult result; |
275 | 13.7k | number = StringParser::string_to_float<double>(raw_string.data(), raw_string.size(), |
276 | 13.7k | &result); |
277 | 13.7k | if (result != StringParser::PARSE_SUCCESS) { |
278 | 0 | return Status::InvalidArgument("invalid number, raw string is: " + |
279 | 0 | std::string(raw_string)); |
280 | 0 | } |
281 | 13.7k | } |
282 | 115k | if (!std::isfinite(number)) { |
283 | 0 | return Status::InvalidArgument("invalid number, raw string is: " + |
284 | 0 | std::string(raw_string)); |
285 | 0 | } |
286 | | |
287 | 115k | if (writer.writeDouble(number) == 0) { |
288 | 0 | return Status::InvalidArgument("writeDouble failed"); |
289 | 0 | } |
290 | | |
291 | 115k | break; |
292 | 115k | } |
293 | 115k | case simdjson::ondemand::number_type::signed_integer: |
294 | 109k | case simdjson::ondemand::number_type::unsigned_integer: { |
295 | 109k | int128_t val = num.is_int64() ? (int128_t)num.get_int64() : (int128_t)num.get_uint64(); |
296 | 109k | RETURN_IF_ERROR(write_int128(val, writer)); |
297 | 109k | break; |
298 | 109k | } |
299 | 109k | case simdjson::ondemand::number_type::big_integer: { |
300 | 0 | RETURN_IF_ERROR(write_number_from_raw_json(raw_string, writer)); |
301 | 0 | break; |
302 | 0 | } |
303 | 224k | } |
304 | 224k | return Status::OK(); |
305 | 224k | } |
306 | | |
307 | 42 | static bool is_json_number_space(char c) { |
308 | 42 | return c == ' ' || c == '\t' || c == '\n' || c == '\r'; |
309 | 42 | } |
310 | | |
311 | 20 | static std::string_view trim_json_number(std::string_view raw_number) { |
312 | 21 | while (!raw_number.empty() && is_json_number_space(raw_number.front())) { |
313 | 1 | raw_number.remove_prefix(1); |
314 | 1 | } |
315 | 21 | while (!raw_number.empty() && is_json_number_space(raw_number.back())) { |
316 | 1 | raw_number.remove_suffix(1); |
317 | 1 | } |
318 | 20 | return raw_number; |
319 | 20 | } |
320 | | |
321 | 1.08k | static bool is_json_number_digit(char c) { return c >= '0' && c <= '9'; } |
322 | | |
323 | 20 | static Status validate_json_number(std::string_view raw_number, bool& is_integer) { |
324 | 20 | if (raw_number.empty()) { |
325 | 0 | return Status::InvalidArgument("empty number"); |
326 | 0 | } |
327 | | |
328 | 20 | size_t pos = 0; |
329 | 20 | if (raw_number[pos] == '-') { |
330 | 0 | ++pos; |
331 | 0 | if (pos == raw_number.size()) { |
332 | 0 | return Status::InvalidArgument("invalid number, raw string is: " + |
333 | 0 | std::string(raw_number)); |
334 | 0 | } |
335 | 0 | } |
336 | | |
337 | 20 | if (raw_number[pos] == '0') { |
338 | 0 | ++pos; |
339 | 20 | } else if (raw_number[pos] >= '1' && raw_number[pos] <= '9') { |
340 | 1.08k | while (pos < raw_number.size() && is_json_number_digit(raw_number[pos])) { |
341 | 1.06k | ++pos; |
342 | 1.06k | } |
343 | 20 | } else { |
344 | 0 | return Status::InvalidArgument("invalid number, raw string is: " + |
345 | 0 | std::string(raw_number)); |
346 | 0 | } |
347 | | |
348 | 20 | bool has_fraction = false; |
349 | 20 | if (pos < raw_number.size() && raw_number[pos] == '.') { |
350 | 4 | has_fraction = true; |
351 | 4 | ++pos; |
352 | 4 | if (pos == raw_number.size() || !is_json_number_digit(raw_number[pos])) { |
353 | 3 | return Status::InvalidArgument("invalid number, raw string is: " + |
354 | 3 | std::string(raw_number)); |
355 | 3 | } |
356 | 3 | while (pos < raw_number.size() && is_json_number_digit(raw_number[pos])) { |
357 | 2 | ++pos; |
358 | 2 | } |
359 | 1 | } |
360 | | |
361 | 17 | bool has_exponent = false; |
362 | 17 | if (pos < raw_number.size() && (raw_number[pos] == 'e' || raw_number[pos] == 'E')) { |
363 | 3 | has_exponent = true; |
364 | 3 | ++pos; |
365 | 3 | if (pos < raw_number.size() && (raw_number[pos] == '+' || raw_number[pos] == '-')) { |
366 | 0 | ++pos; |
367 | 0 | } |
368 | 3 | if (pos == raw_number.size() || !is_json_number_digit(raw_number[pos])) { |
369 | 1 | return Status::InvalidArgument("invalid number, raw string is: " + |
370 | 1 | std::string(raw_number)); |
371 | 1 | } |
372 | 10 | while (pos < raw_number.size() && is_json_number_digit(raw_number[pos])) { |
373 | 8 | ++pos; |
374 | 8 | } |
375 | 2 | } |
376 | | |
377 | 16 | if (pos != raw_number.size()) { |
378 | 4 | return Status::InvalidArgument("simdjson parse exception: trailing content"); |
379 | 4 | } |
380 | 12 | is_integer = !has_fraction && !has_exponent; |
381 | 12 | return Status::OK(); |
382 | 16 | } |
383 | | |
384 | 109k | static Status write_int128(int128_t val, JsonbWriter& writer) { |
385 | 109k | bool success = false; |
386 | 109k | if (val >= std::numeric_limits<int8_t>::min() && |
387 | 109k | val <= std::numeric_limits<int8_t>::max()) { |
388 | 48.0k | success = writer.writeInt8((int8_t)val); |
389 | 61.1k | } else if (val >= std::numeric_limits<int16_t>::min() && |
390 | 61.1k | val <= std::numeric_limits<int16_t>::max()) { |
391 | 97 | success = writer.writeInt16((int16_t)val); |
392 | 61.0k | } else if (val >= std::numeric_limits<int32_t>::min() && |
393 | 61.0k | val <= std::numeric_limits<int32_t>::max()) { |
394 | 30 | success = writer.writeInt32((int32_t)val); |
395 | 60.9k | } else if (val >= std::numeric_limits<int64_t>::min() && |
396 | 60.9k | val <= std::numeric_limits<int64_t>::max()) { |
397 | 60.9k | success = writer.writeInt64((int64_t)val); |
398 | 60.9k | } else { // INT128 |
399 | 18 | success = writer.writeInt128(val); |
400 | 18 | } |
401 | | |
402 | 109k | if (!success) { |
403 | 0 | return Status::InvalidArgument("writeInt failed"); |
404 | 0 | } |
405 | 109k | return Status::OK(); |
406 | 109k | } |
407 | | |
408 | 16 | static Status write_number_from_raw_json(const char* pch, size_t len, JsonbWriter& writer) { |
409 | 16 | return write_number_from_raw_json(std::string_view(pch, len), writer); |
410 | 16 | } |
411 | | |
412 | | // According to https://github.com/simdjson/simdjson/pull/2139 |
413 | | // For numbers larger than 64 bits, we can obtain the raw_json_token and parse it ourselves. |
414 | | // This allows handling numbers larger than 64 bits, such as int128. |
415 | 20 | static Status write_number_from_raw_json(std::string_view raw_number, JsonbWriter& writer) { |
416 | 20 | raw_number = trim_json_number(raw_number); |
417 | 20 | bool is_integer = false; |
418 | 20 | RETURN_IF_ERROR(validate_json_number(raw_number, is_integer)); |
419 | | |
420 | 12 | StringParser::ParseResult result; |
421 | 12 | if (is_integer) { |
422 | 10 | auto val = StringParser::string_to_int<int128_t>(raw_number.data(), raw_number.size(), |
423 | 10 | &result); |
424 | 10 | if (result == StringParser::PARSE_SUCCESS) { |
425 | 6 | RETURN_IF_ERROR(write_int128(val, writer)); |
426 | 6 | return Status::OK(); |
427 | 6 | } |
428 | 10 | } |
429 | | |
430 | | // If the string exceeds the range of int128_t, it will attempt to convert it to double. |
431 | | // This may result in loss of precision, but for JSON, exchanging data as plain text |
432 | | // between different systems may inherently cause precision loss. |
433 | 6 | double double_val = StringParser::string_to_float<double>(raw_number.data(), |
434 | 6 | raw_number.size(), &result); |
435 | 6 | if (result != StringParser::PARSE_SUCCESS || !std::isfinite(double_val)) { |
436 | 4 | return Status::InvalidArgument("invalid number, raw string is: " + |
437 | 4 | std::string(raw_number)); |
438 | 4 | } |
439 | 2 | if (!writer.writeDouble(double_val)) { |
440 | 0 | return Status::InvalidArgument("writeDouble failed"); |
441 | 0 | } |
442 | 2 | return Status::OK(); |
443 | 2 | } |
444 | | }; |
445 | | } // namespace doris |