be/src/util/jsonb_parser_simd.h
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2014, Facebook, Inc. |
3 | | * All rights reserved. |
4 | | * |
5 | | * This source code is licensed under the BSD-style license found in the |
6 | | * LICENSE file in the root directory of this source tree. An additional grant |
7 | | * of patent rights can be found in the PATENTS file in the same directory. |
8 | | * |
9 | | */ |
10 | | |
11 | | /* |
12 | | * This file defines JsonbParserTSIMD (template) and JsonbParser. |
13 | | * |
14 | | * JsonbParserTSIMD is a template class which implements a JSON parser. |
15 | | * JsonbParserTSIMD parses JSON text, and serializes it to JSONB binary format |
16 | | * by using JsonbWriterT object. By default, JsonbParserTSIMD creates a new |
17 | | * JsonbWriterT object with an output stream object. However, you can also |
18 | | * pass in your JsonbWriterT or any stream object that implements some basic |
19 | | * interface of std::ostream (see JsonbStream.h). |
20 | | * |
21 | | * JsonbParser specializes JsonbParserTSIMD with JsonbOutStream type (see |
22 | | * JsonbStream.h). So unless you want to provide your own output stream |
23 | | * type, use a JsonbParser object. |
24 | | * |
25 | | * ** Parsing JSON ** |
26 | | * JsonbParserTSIMD parses JSON string, and directly serializes into JSONB |
27 | | * packed bytes. There are three ways to parse a JSON string: (1) using |
28 | | * c-string, (2) using string with len, (3) using std::istream object. You can |
29 | | * use custom streambuf to redirect output. JsonbOutBuffer is a streambuf used |
30 | | * internally if the input is raw character buffer. |
31 | | * |
32 | | * You can reuse a JsonbParserTSIMD object to parse/serialize multiple JSON |
33 | | * strings, and the previous JSONB will be overwritten. |
34 | | * |
35 | | * If parsing fails (returned false), the error code will be set to one of |
36 | | * JsonbErrType, and can be retrieved by calling getErrorCode(). |
37 | | * |
38 | | * ** External dictionary ** |
39 | | * During parsing a JSON string, you can pass a call-back function to map a key |
40 | | * string to an id, and store the dictionary id in JSONB to save space. The |
41 | | * purpose of using an external dictionary is more towards a collection of |
42 | | * documents (which has common keys) rather than a single document, so that |
43 | | * space saving will be significant. |
44 | | * |
45 | | * ** Endianness ** |
46 | | * Note: JSONB serialization doesn't assume endianness of the server. However |
47 | | * you will need to ensure that the endianness at the reader side is the same |
48 | | * as that at the writer side (if they are on different machines). Otherwise, |
49 | | * proper conversion is needed when a number value is returned to the |
50 | | * caller/writer. |
51 | | * |
52 | | * @author Tian Xia <tianx@fb.com> |
53 | | * |
54 | | * this file is copied from |
55 | | * https://github.com/facebook/mysql-5.6/blob/fb-mysql-5.6.35/fbson/FbsonJsonParser.h |
56 | | * and modified by Doris |
57 | | */ |
58 | | |
59 | | #pragma once |
60 | | #include <simdjson.h> |
61 | | |
62 | | #include <cmath> |
63 | | #include <limits> |
64 | | |
65 | | #include "common/status.h" |
66 | | #include "util/jsonb_document.h" |
67 | | #include "util/jsonb_writer.h" |
68 | | #include "util/string_parser.hpp" |
69 | | |
70 | | namespace doris { |
71 | | #include "common/compile_check_begin.h" |
72 | | using int128_t = __int128; |
73 | | struct JsonbParser { |
74 | | // According to https://github.com/simdjson/simdjson/pull/2139 |
75 | | // For numbers larger than 64 bits, we can obtain the raw_json_token and parse it ourselves. |
76 | | // This allows handling numbers larger than 64 bits, such as int128. |
77 | | // For example, try to parse a 18446744073709551616, this number is just 1 greater than the maximum value of uint64_t, and simdjson will return a NUMBER_ERROR |
78 | | // If try to parse a 18446744073709551616231231, it is obviously a large integer, at this time simdjson will return a BIGINT_ERROR |
79 | 344k | static bool parse_number_success(simdjson::error_code error_code) { |
80 | 344k | return error_code == simdjson::error_code::SUCCESS || |
81 | 344k | error_code == simdjson::error_code::NUMBER_ERROR || |
82 | 344k | error_code == simdjson::error_code::BIGINT_ERROR; |
83 | 344k | } |
84 | | |
85 | | // parse a UTF-8 JSON string with length |
86 | | // will reset writer before parse |
87 | 59.1k | static Status parse(const char* pch, size_t len, JsonbWriter& writer) { |
88 | 59.1k | if (!pch || len == 0) { |
89 | 21 | return Status::InvalidArgument("Empty JSON document"); |
90 | 21 | } |
91 | 59.1k | writer.reset(); |
92 | 59.1k | try { |
93 | 59.1k | simdjson::ondemand::parser simdjson_parser; |
94 | 59.1k | simdjson::padded_string json_str {pch, len}; |
95 | 59.1k | simdjson::ondemand::document doc = simdjson_parser.iterate(json_str); |
96 | | |
97 | | // simdjson process top level primitive types specially |
98 | | // so some repeated code here |
99 | 59.1k | switch (doc.type()) { |
100 | 45.5k | case simdjson::ondemand::json_type::object: |
101 | 56.9k | case simdjson::ondemand::json_type::array: { |
102 | 56.9k | RETURN_IF_ERROR(parse(doc.get_value(), writer)); |
103 | 56.6k | break; |
104 | 56.9k | } |
105 | 56.6k | case simdjson::ondemand::json_type::null: { |
106 | 122 | if (writer.writeNull() == 0) { |
107 | 0 | return Status::InvalidArgument("writeNull failed"); |
108 | 0 | } |
109 | 122 | break; |
110 | 122 | } |
111 | 135 | case simdjson::ondemand::json_type::boolean: { |
112 | 135 | if (writer.writeBool(doc.get_bool()) == 0) { |
113 | 0 | return Status::InvalidArgument("writeBool failed"); |
114 | 0 | } |
115 | 135 | break; |
116 | 135 | } |
117 | 335 | case simdjson::ondemand::json_type::string: { |
118 | 335 | RETURN_IF_ERROR(write_string(doc.get_string(), writer)); |
119 | 335 | break; |
120 | 335 | } |
121 | 545 | case simdjson::ondemand::json_type::number: { |
122 | 545 | simdjson::ondemand::number num; |
123 | 545 | simdjson::error_code res = doc.get_number().get(num); |
124 | 545 | if (!parse_number_success(res)) { |
125 | 0 | return Status::InvalidArgument(fmt::format("simdjson get_number failed: {}", |
126 | 0 | simdjson::error_message(res))); |
127 | 0 | } |
128 | | // simdjson get_number() returns a number object, which can be |
129 | 545 | RETURN_IF_ERROR( |
130 | 545 | write_number(num, doc.get_number_type(), doc.raw_json_token(), writer)); |
131 | 487 | break; |
132 | 545 | } |
133 | 59.1k | } |
134 | 59.1k | } catch (simdjson::simdjson_error& e) { |
135 | 1.14k | return Status::InvalidArgument(fmt::format("simdjson parse exception: {}", e.what())); |
136 | 1.14k | } |
137 | 57.6k | return Status::OK(); |
138 | 59.1k | } |
139 | | |
140 | | private: |
141 | | // parse json, recursively if necessary, by simdjson |
142 | | // and serialize to binary format by writer |
143 | 901k | static Status parse(simdjson::ondemand::value value, JsonbWriter& writer) { |
144 | 901k | switch (value.type()) { |
145 | 11.9k | case simdjson::ondemand::json_type::null: { |
146 | 11.9k | if (writer.writeNull() == 0) { |
147 | 0 | return Status::InvalidArgument("writeNull failed"); |
148 | 0 | } |
149 | 11.9k | break; |
150 | 11.9k | } |
151 | 21.3k | case simdjson::ondemand::json_type::boolean: { |
152 | 21.3k | if (writer.writeBool(value.get_bool()) == 0) { |
153 | 0 | return Status::InvalidArgument("writeBool failed"); |
154 | 0 | } |
155 | 21.3k | break; |
156 | 21.3k | } |
157 | 385k | case simdjson::ondemand::json_type::string: { |
158 | 385k | RETURN_IF_ERROR(write_string(value.get_string(), writer)); |
159 | 385k | break; |
160 | 385k | } |
161 | 385k | case simdjson::ondemand::json_type::number: { |
162 | 343k | simdjson::ondemand::number num; |
163 | 343k | auto res = value.get_number().get(num); |
164 | 343k | if (!parse_number_success(res)) { |
165 | 0 | return Status::InvalidArgument(fmt::format("simdjson get_number failed: {}", |
166 | 0 | simdjson::error_message(res))); |
167 | 0 | } |
168 | | |
169 | 343k | RETURN_IF_ERROR( |
170 | 343k | write_number(num, value.get_number_type(), value.raw_json_token(), writer)); |
171 | 343k | break; |
172 | 343k | } |
173 | 343k | case simdjson::ondemand::json_type::object: { |
174 | 64.9k | if (!writer.writeStartObject()) { |
175 | 0 | return Status::InvalidArgument("writeStartObject failed"); |
176 | 0 | } |
177 | | |
178 | 209k | for (auto kv : value.get_object()) { |
179 | 209k | std::string_view key; |
180 | 209k | simdjson::error_code e = kv.unescaped_key().get(key); |
181 | 209k | if (e != simdjson::SUCCESS) { |
182 | 41 | return Status::InvalidArgument(fmt::format("simdjson get key failed: {}", e)); |
183 | 41 | } |
184 | | |
185 | | // write key |
186 | 209k | if (key.size() > std::numeric_limits<uint8_t>::max()) { |
187 | 268 | return Status::InvalidArgument("key size exceeds max limit: {} , {}", |
188 | 268 | key.size(), std::numeric_limits<uint8_t>::max()); |
189 | 268 | } |
190 | 208k | if (!writer.writeKey(key.data(), (uint8_t)key.size())) { |
191 | 0 | return Status::InvalidArgument("writeKey failed : {}", key); |
192 | 0 | } |
193 | | |
194 | | // parse object value |
195 | 208k | RETURN_IF_ERROR(parse(kv.value(), writer)); |
196 | 208k | } |
197 | | |
198 | 64.5k | if (!writer.writeEndObject()) { |
199 | 0 | return Status::InvalidArgument("writeEndObject failed"); |
200 | 0 | break; |
201 | 0 | } |
202 | | |
203 | 64.5k | break; |
204 | 64.5k | } |
205 | 74.2k | case simdjson::ondemand::json_type::array: { |
206 | 74.2k | if (!writer.writeStartArray()) { |
207 | 0 | return Status::InvalidArgument("writeStartArray failed"); |
208 | 0 | } |
209 | | |
210 | 636k | for (auto elem : value.get_array()) { |
211 | | // parse array element |
212 | 636k | RETURN_IF_ERROR(parse(elem.value(), writer)); |
213 | 636k | } |
214 | | |
215 | 74.0k | if (!writer.writeEndArray()) { |
216 | 0 | return Status::InvalidArgument("writeEndArray failed"); |
217 | 0 | } |
218 | 74.0k | break; |
219 | 74.0k | } |
220 | 74.0k | default: { |
221 | 0 | return Status::InvalidArgument("unknown value type: "); |
222 | 74.0k | } |
223 | | |
224 | 901k | } // end of switch |
225 | 901k | return Status::OK(); |
226 | 901k | } |
227 | | |
228 | 386k | static Status write_string(std::string_view str, JsonbWriter& writer) { |
229 | | // start writing string |
230 | 386k | if (!writer.writeStartString()) { |
231 | 0 | return Status::InvalidArgument("writeStartString failed"); |
232 | 0 | } |
233 | | |
234 | | // write string |
235 | 386k | if (str.size() > 0) { |
236 | 374k | if (writer.writeString(str.data(), str.size()) == 0) { |
237 | 0 | return Status::InvalidArgument("writeString failed"); |
238 | 0 | } |
239 | 374k | } |
240 | | |
241 | | // end writing string |
242 | 386k | if (!writer.writeEndString()) { |
243 | 0 | return Status::InvalidArgument("writeEndString failed"); |
244 | 0 | } |
245 | 386k | return Status::OK(); |
246 | 386k | } |
247 | | |
248 | | static Status write_number(simdjson::ondemand::number num, |
249 | | simdjson ::ondemand::number_type num_type, |
250 | 344k | std::string_view raw_string, JsonbWriter& writer) { |
251 | | // The simdjson library supports four types of numbers: |
252 | | // 1. floating_point_number: A binary64 number, which will be converted to jsonb's double type. |
253 | | // 2. signed_integer: A signed integer that fits in a 64-bit word using two's complement. |
254 | | // 3. unsigned_integer: A positive integer larger or equal to 1<<63. |
255 | | // For these two integer types, we will convert them to jsonb's int8/int16/int32/int64/int128 types according to the specific value. |
256 | | // 4. big_integer: An integer that does not fit in a 64-bit word. |
257 | | // For this type, simdjson cannot handle it directly. We first try to convert it to jsonb's int128 type. |
258 | | // If conversion fails, we attempt to convert it to a double type. |
259 | | // If conversion to double also fails, an error is returned. |
260 | | |
261 | 344k | switch (num_type) { |
262 | 140k | case simdjson::ondemand::number_type::floating_point_number: { |
263 | 140k | double number = num.get_double(); |
264 | | // When a double exceeds the precision that can be represented by a double type in simdjson, it gets converted to 0. |
265 | | // The correct approach, should be to truncate the double value instead. |
266 | 140k | if (number == 0) { |
267 | 13.7k | StringParser::ParseResult result; |
268 | 13.7k | number = StringParser::string_to_float<double>(raw_string.data(), raw_string.size(), |
269 | 13.7k | &result); |
270 | 13.7k | if (result != StringParser::PARSE_SUCCESS) { |
271 | 59 | return Status::InvalidArgument("invalid number, raw string is: " + |
272 | 59 | std::string(raw_string)); |
273 | 59 | } |
274 | 13.7k | } |
275 | | |
276 | 140k | if (writer.writeDouble(number) == 0) { |
277 | 0 | return Status::InvalidArgument("writeDouble failed"); |
278 | 0 | } |
279 | | |
280 | 140k | break; |
281 | 140k | } |
282 | 204k | case simdjson::ondemand::number_type::signed_integer: |
283 | 204k | case simdjson::ondemand::number_type::unsigned_integer: { |
284 | 204k | int128_t val = num.is_int64() ? (int128_t)num.get_int64() : (int128_t)num.get_uint64(); |
285 | 204k | bool success = false; |
286 | 204k | if (val >= std::numeric_limits<int8_t>::min() && |
287 | 204k | val <= std::numeric_limits<int8_t>::max()) { |
288 | 76.4k | success = writer.writeInt8((int8_t)val); |
289 | 127k | } else if (val >= std::numeric_limits<int16_t>::min() && |
290 | 127k | val <= std::numeric_limits<int16_t>::max()) { |
291 | 50.8k | success = writer.writeInt16((int16_t)val); |
292 | 76.8k | } else if (val >= std::numeric_limits<int32_t>::min() && |
293 | 76.8k | val <= std::numeric_limits<int32_t>::max()) { |
294 | 8.44k | success = writer.writeInt32((int32_t)val); |
295 | 68.3k | } else if (val >= std::numeric_limits<int64_t>::min() && |
296 | 68.4k | val <= std::numeric_limits<int64_t>::max()) { |
297 | 68.4k | success = writer.writeInt64((int64_t)val); |
298 | 18.4E | } else { // INT128 |
299 | 18.4E | success = writer.writeInt128(val); |
300 | 18.4E | } |
301 | | |
302 | 204k | if (!success) { |
303 | 0 | return Status::InvalidArgument("writeInt failed"); |
304 | 0 | } |
305 | 204k | break; |
306 | 204k | } |
307 | 204k | case simdjson::ondemand::number_type::big_integer: { |
308 | 12 | StringParser::ParseResult result; |
309 | 12 | auto val = StringParser::string_to_int<int128_t>(raw_string.data(), raw_string.size(), |
310 | 12 | &result); |
311 | 12 | if (result != StringParser::PARSE_SUCCESS) { |
312 | | // If the string exceeds the range of int128_t, it will attempt to convert it to double. |
313 | | // This may result in loss of precision, but for JSON, exchanging data as plain text between different systems may inherently cause precision loss. |
314 | | // try parse as double |
315 | 2 | double double_val = StringParser::string_to_float<double>( |
316 | 2 | raw_string.data(), raw_string.size(), &result); |
317 | 2 | if (result != StringParser::PARSE_SUCCESS) { |
318 | | // if both parse failed, return error |
319 | 0 | return Status::InvalidArgument("invalid number, raw string is: " + |
320 | 0 | std::string(raw_string)); |
321 | 0 | } |
322 | 2 | if (!writer.writeDouble(double_val)) { |
323 | 0 | return Status::InvalidArgument("writeDouble failed"); |
324 | 0 | } |
325 | 10 | } else { |
326 | | // as int128_t |
327 | 10 | if (!writer.writeInt128(val)) { |
328 | 0 | return Status::InvalidArgument("writeInt128 failed"); |
329 | 0 | } |
330 | 10 | } |
331 | 12 | break; |
332 | 12 | } |
333 | 344k | } |
334 | 344k | return Status::OK(); |
335 | 344k | } |
336 | | }; |
337 | | #include "common/compile_check_end.h" |
338 | | } // namespace doris |