/root/doris/be/src/util/jsonb_parser.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2014, Facebook, Inc. |
3 | | * All rights reserved. |
4 | | * |
5 | | * This source code is licensed under the BSD-style license found in the |
6 | | * LICENSE file in the root directory of this source tree. An additional grant |
7 | | * of patent rights can be found in the PATENTS file in the same directory. |
8 | | * |
9 | | */ |
10 | | |
11 | | /* |
12 | | * This file defines JsonbParserT (template) and JsonbParser. |
13 | | * |
14 | | * JsonbParserT is a template class which implements a JSON parser. |
15 | | * JsonbParserT parses JSON text, and serializes it to JSONB binary format |
16 | | * by using JsonbWriterT object. By default, JsonbParserT creates a new |
17 | | * JsonbWriterT object with an output stream object. However, you can also |
18 | | * pass in your JsonbWriterT or any stream object that implements some basic |
19 | | * interface of std::ostream (see JsonbStream.h). |
20 | | * |
21 | | * JsonbParser specializes JsonbParserT with JsonbOutStream type (see |
22 | | * JsonbStream.h). So unless you want to provide your own output stream |
23 | | * type, use a JsonbParser object. |
24 | | * |
25 | | * ** Parsing JSON ** |
26 | | * JsonbParserT parses JSON string, and directly serializes into JSONB |
27 | | * packed bytes. There are three ways to parse a JSON string: (1) using |
28 | | * c-string, (2) using string with len, (3) using std::istream object. You can |
29 | | * use custom streambuf to redirect output. JsonbInBuffer is a streambuf used |
30 | | * internally if the input is a raw character buffer. |
31 | | * |
32 | | * You can reuse a JsonbParserT object to parse/serialize multiple JSON |
33 | | * strings, and the previous JSONB will be overwritten. |
34 | | * |
35 | | * If parsing fails (returned false), the error code will be set to one of |
36 | | * JsonbErrType, and can be retrieved by calling getErrorCode(). |
37 | | * |
38 | | * ** External dictionary ** |
39 | | * During parsing a JSON string, you can pass a call-back function to map a key |
40 | | * string to an id, and store the dictionary id in JSONB to save space. The |
41 | | * purpose of using an external dictionary is more towards a collection of |
42 | | * documents (which has common keys) rather than a single document, so that |
43 | | * space saving will be significant. |
44 | | * |
45 | | * ** Endianness ** |
46 | | * Note: JSONB serialization doesn't assume endianness of the server. However |
47 | | * you will need to ensure that the endianness at the reader side is the same |
48 | | * as that at the writer side (if they are on different machines). Otherwise, |
49 | | * proper conversion is needed when a number value is returned to the |
50 | | * caller/writer. |
51 | | * |
52 | | * @author Tian Xia <tianx@fb.com> |
53 | | * |
54 | | * this file is copied from |
55 | | * https://github.com/facebook/mysql-5.6/blob/fb-mysql-5.6.35/fbson/FbsonJsonParser.h |
56 | | * and modified by Doris |
57 | | */ |
58 | | |
59 | | #ifndef JSONB_JSONBJSONPARSER_H |
60 | | #define JSONB_JSONBJSONPARSER_H |
61 | | |
62 | | #include <cmath> |
63 | | #include <limits> |
64 | | |
65 | | #include "jsonb_document.h" |
66 | | #include "jsonb_error.h" |
67 | | #include "jsonb_writer.h" |
68 | | #include "string_parser.hpp" |
69 | | |
70 | | namespace doris { |
71 | | |
// Characters that end a numeric token: space, comma, closing bracket, closing
// brace, tab, carriage return and newline. The parser looks characters up with
// strchr(), which also matches the terminating NUL of this string.
constexpr const char* kJsonDelim = " ,]}\t\r\n";
// Characters skipped by trim() between tokens.
constexpr const char* kWhiteSpace = " \t\n\r";
74 | | |
75 | | /* |
76 | | * Template JsonbParserT |
77 | | */ |
78 | | template <class OS_TYPE> |
79 | | class JsonbParserT { |
80 | | public: |
81 | 991 | JsonbParserT() : stream_pos_(0), err_(JsonbErrType::E_NONE) {} |
82 | | |
83 | | explicit JsonbParserT(OS_TYPE& os) : writer_(os), stream_pos_(0), err_(JsonbErrType::E_NONE) {} |
84 | | |
85 | | // parse a UTF-8 JSON string |
86 | | bool parse(const std::string& str, hDictInsert handler = nullptr) { |
87 | | return parse(str.c_str(), str.size(), handler); |
88 | | } |
89 | | |
90 | | // parse a UTF-8 JSON c-style string (NULL terminated) |
91 | | bool parse(const char* c_str, hDictInsert handler = nullptr) { |
92 | | return parse(c_str, strlen(c_str), handler); |
93 | | } |
94 | | |
95 | | // parse a UTF-8 JSON string with length |
96 | 1.19k | bool parse(const char* pch, size_t len, hDictInsert handler = nullptr) { |
97 | 1.19k | if (!pch || len == 0) { |
98 | 0 | err_ = JsonbErrType::E_EMPTY_DOCUMENT; |
99 | 0 | return false; |
100 | 0 | } |
101 | | |
102 | 1.19k | JsonbInBuffer sb(pch, len); |
103 | 1.19k | std::istream in(&sb); |
104 | 1.19k | return parse(in, handler); |
105 | 1.19k | } |
106 | | |
107 | | // parse UTF-8 JSON text from an input stream |
108 | 1.19k | bool parse(std::istream& in, hDictInsert handler = nullptr) { |
109 | 1.19k | bool res = false; |
110 | 1.19k | err_ = JsonbErrType::E_NONE; |
111 | 1.19k | stream_pos_ = 0; |
112 | | |
113 | | // reset output stream |
114 | 1.19k | writer_.reset(); |
115 | | |
116 | 1.19k | trim(in); |
117 | | |
118 | | // TODO(wzy): parsePrimitive should be implemented |
119 | 1.19k | if (in.peek() == '{') { |
120 | 140 | skipChar(in); |
121 | 140 | res = parseObject(in, handler); |
122 | 1.05k | } else if (in.peek() == '[') { |
123 | 426 | skipChar(in); |
124 | 426 | res = parseArray(in, handler); |
125 | 627 | } else { |
126 | 627 | res = parsePrimitive(in, handler); |
127 | 627 | if (!res) err_ = handle_parse_failure(in); |
128 | 627 | } |
129 | | |
130 | 1.19k | trim(in); |
131 | 1.19k | if (res && !in.eof()) { |
132 | 0 | err_ = JsonbErrType::E_INVALID_DOCU; |
133 | 0 | return false; |
134 | 0 | } |
135 | | |
136 | 1.19k | return res; |
137 | 1.19k | } |
138 | | |
139 | 2.25k | JsonbWriterT<OS_TYPE>& getWriter() { return writer_; } |
140 | | |
141 | 68 | JsonbErrType getErrorCode() { return err_; } |
142 | | |
143 | | JsonbErrInfo getErrorInfo() { |
144 | | assert(err_ < JsonbErrType::E_NUM_ERRORS); |
145 | | |
146 | | JsonbErrInfo err_info; |
147 | | |
148 | | // stream_pos_ always points to the next char, so err_pos is 1-based |
149 | | err_info.err_pos = stream_pos_; |
150 | | err_info.err_msg = JsonbErrMsg::getErrMsg(err_); |
151 | | |
152 | | return err_info; |
153 | | } |
154 | | |
155 | | // clear error code |
156 | | void clearErr() { err_ = JsonbErrType::E_NONE; } |
157 | | |
158 | | private: |
159 | 0 | JsonbErrType handle_parse_value_failure(bool parse_res, std::istream& in) { |
160 | 0 | if (parse_res) { |
161 | 0 | trim(in); |
162 | 0 | if (!in.good()) { |
163 | 0 | return JsonbErrType::E_INVALID_DOCU_COMPAT; |
164 | 0 | } |
165 | 0 | } |
166 | 0 | return JsonbErrType::E_INVALID_DOCU; |
167 | 0 | ; |
168 | 0 | } |
169 | | |
170 | | // In case json is determined to be invalid at top level, |
171 | | // try to parse literal values. |
172 | | // We return a different error code E_INVALID_DOCU_COMPAT |
173 | | // in case the input json contains these values. |
174 | | // Returning a different error code will cause an |
175 | | // auditing on the caller. |
176 | | // This is mainly done because 8.0 JSON_VALID considers |
177 | | // this as a valid input. |
178 | 46 | JsonbErrType handle_parse_failure(std::istream& in) { |
179 | 46 | JsonbErrType error = JsonbErrType::E_INVALID_DOCU; |
180 | 46 | if (!writer_.writeStartArray()) { |
181 | 0 | return error; |
182 | 0 | } |
183 | | |
184 | 46 | switch (in.peek()) { |
185 | 0 | case 'n': |
186 | 0 | skipChar(in); |
187 | 0 | error = handle_parse_value_failure(parseNull(in), in); |
188 | 0 | break; |
189 | 0 | case 't': |
190 | 0 | skipChar(in); |
191 | 0 | error = handle_parse_value_failure(parseTrue(in), in); |
192 | 0 | break; |
193 | 0 | case 'f': |
194 | 0 | skipChar(in); |
195 | 0 | error = handle_parse_value_failure(parseFalse(in), in); |
196 | 0 | break; |
197 | 0 | case '"': |
198 | 0 | skipChar(in); |
199 | 0 | error = handle_parse_value_failure(parseString(in), in); |
200 | 0 | break; |
201 | 46 | default: |
202 | 46 | if (parseNumber(in)) { |
203 | 0 | trim(in); |
204 | 0 | if (in.eof()) { |
205 | 0 | error = JsonbErrType::E_INVALID_DOCU_COMPAT; |
206 | 0 | } |
207 | 0 | } |
208 | 46 | } |
209 | 46 | if (!writer_.writeEndArray()) { |
210 | 0 | return error; |
211 | 0 | } |
212 | | |
213 | 46 | return error; |
214 | 46 | } |
215 | | |
216 | | // parse primitive |
217 | 627 | bool parsePrimitive(std::istream& in, hDictInsert handler) { |
218 | 627 | bool res = false; |
219 | 627 | switch (in.peek()) { |
220 | 63 | case 'n': |
221 | 63 | skipChar(in); |
222 | 63 | res = parseNull(in); |
223 | 63 | break; |
224 | 63 | case 't': |
225 | 63 | skipChar(in); |
226 | 63 | res = parseTrue(in); |
227 | 63 | break; |
228 | 63 | case 'f': |
229 | 63 | skipChar(in); |
230 | 63 | res = parseFalse(in); |
231 | 63 | break; |
232 | 71 | case '"': |
233 | 71 | skipChar(in); |
234 | 71 | res = parseString(in); |
235 | 71 | break; |
236 | 367 | default: |
237 | 367 | res = parseNumber(in); |
238 | 627 | } |
239 | | |
240 | 627 | return res; |
241 | 627 | } |
242 | | |
243 | | // parse a JSON object (comma-separated list of key-value pairs) |
244 | 236 | bool parseObject(std::istream& in, hDictInsert handler) { |
245 | 236 | if (!writer_.writeStartObject()) { |
246 | 0 | err_ = JsonbErrType::E_OUTPUT_FAIL; |
247 | 0 | return false; |
248 | 0 | } |
249 | | |
250 | 236 | trim(in); |
251 | | |
252 | 236 | if (in.peek() == '}') { |
253 | 66 | skipChar(in); |
254 | | // empty object |
255 | 66 | if (!writer_.writeEndObject()) { |
256 | 0 | err_ = JsonbErrType::E_OUTPUT_FAIL; |
257 | 0 | return false; |
258 | 0 | } |
259 | 66 | return true; |
260 | 66 | } |
261 | | |
262 | 329 | while (in.good()) { |
263 | 329 | if (nextChar(in) != '"') { |
264 | 11 | err_ = JsonbErrType::E_INVALID_OBJ; |
265 | 11 | return false; |
266 | 11 | } |
267 | | |
268 | 318 | if (!parseKVPair(in, handler)) { |
269 | 0 | return false; |
270 | 0 | } |
271 | | |
272 | 318 | trim(in); |
273 | | |
274 | 318 | char ch = nextChar(in); |
275 | 318 | if (ch == '}') { |
276 | | // end of the object |
277 | 159 | if (!writer_.writeEndObject()) { |
278 | 0 | err_ = JsonbErrType::E_OUTPUT_FAIL; |
279 | 0 | return false; |
280 | 0 | } |
281 | 159 | return true; |
282 | 159 | } else if (ch != ',') { |
283 | 0 | err_ = JsonbErrType::E_INVALID_OBJ; |
284 | 0 | return false; |
285 | 0 | } |
286 | | |
287 | 159 | trim(in); |
288 | 159 | } |
289 | | |
290 | 0 | err_ = JsonbErrType::E_INVALID_OBJ; |
291 | 0 | return false; |
292 | 170 | } |
293 | | |
294 | | // parse a JSON array (comma-separated list of values) |
295 | 426 | bool parseArray(std::istream& in, hDictInsert handler) { |
296 | 426 | if (!writer_.writeStartArray()) { |
297 | 0 | err_ = JsonbErrType::E_OUTPUT_FAIL; |
298 | 0 | return false; |
299 | 0 | } |
300 | | |
301 | 426 | trim(in); |
302 | | |
303 | 426 | if (in.peek() == ']') { |
304 | 63 | skipChar(in); |
305 | | // empty array |
306 | 63 | if (!writer_.writeEndArray()) { |
307 | 0 | err_ = JsonbErrType::E_OUTPUT_FAIL; |
308 | 0 | return false; |
309 | 0 | } |
310 | 63 | return true; |
311 | 63 | } |
312 | | |
313 | 1.35k | while (in.good()) { |
314 | 1.35k | if (!parseValue(in, handler)) { |
315 | 11 | return false; |
316 | 11 | } |
317 | | |
318 | 1.34k | trim(in); |
319 | | |
320 | 1.34k | char ch = nextChar(in); |
321 | 1.34k | if (ch == ']') { |
322 | | // end of the array |
323 | 352 | if (!writer_.writeEndArray()) { |
324 | 0 | err_ = JsonbErrType::E_OUTPUT_FAIL; |
325 | 0 | return false; |
326 | 0 | } |
327 | 352 | return true; |
328 | 991 | } else if (ch != ',') { |
329 | 0 | err_ = JsonbErrType::E_INVALID_ARR; |
330 | 0 | return false; |
331 | 0 | } |
332 | | |
333 | 991 | trim(in); |
334 | 991 | } |
335 | | |
336 | 0 | err_ = JsonbErrType::E_INVALID_ARR; |
337 | 0 | return false; |
338 | 363 | } |
339 | | |
340 | | // parse a key-value pair, separated by ":" |
341 | 318 | bool parseKVPair(std::istream& in, hDictInsert handler) { |
342 | 318 | if (parseKey(in, handler) && parseValue(in, handler)) { |
343 | 318 | return true; |
344 | 318 | } |
345 | | |
346 | 0 | return false; |
347 | 318 | } |
348 | | |
349 | | // parse a key (must be string) |
350 | 318 | bool parseKey(std::istream& in, hDictInsert handler) { |
351 | 318 | char key[JsonbKeyValue::sMaxKeyLen]; |
352 | 318 | int key_len = 0; |
353 | 954 | while (in.good() && in.peek() != '"' && key_len < JsonbKeyValue::sMaxKeyLen) { |
354 | 636 | char ch = nextChar(in); |
355 | 636 | if (ch == '\\') { |
356 | 0 | char escape_buffer[5]; // buffer for escape |
357 | 0 | int len; |
358 | 0 | if (!parseEscape(in, escape_buffer, len)) { |
359 | 0 | err_ = JsonbErrType::E_INVALID_KEY_STRING; |
360 | 0 | return false; |
361 | 0 | } |
362 | 0 | if (key_len + len >= JsonbKeyValue::sMaxKeyLen) { |
363 | 0 | err_ = JsonbErrType::E_INVALID_KEY_LENGTH; |
364 | 0 | return false; |
365 | 0 | } |
366 | 0 | memcpy(key + key_len, escape_buffer, len); |
367 | 0 | key_len += len; |
368 | 636 | } else { |
369 | 636 | key[key_len++] = ch; |
370 | 636 | } |
371 | 636 | } |
372 | | // The JSON key can be an empty string. |
373 | 318 | if (!in.good() || in.peek() != '"') { |
374 | 0 | if (key_len == JsonbKeyValue::sMaxKeyLen) |
375 | 0 | err_ = JsonbErrType::E_INVALID_KEY_LENGTH; |
376 | 0 | else |
377 | 0 | err_ = JsonbErrType::E_INVALID_KEY_STRING; |
378 | 0 | return false; |
379 | 0 | } |
380 | | |
381 | 318 | skipChar(in); // discard '"' |
382 | | |
383 | 318 | int key_id = -1; |
384 | 318 | if (handler) { |
385 | 0 | key_id = handler(key, key_len); |
386 | 0 | } |
387 | | |
388 | 318 | if (key_id < 0) { |
389 | 318 | writer_.writeKey(key, key_len); |
390 | 318 | } else { |
391 | 0 | writer_.writeKey(key_id); |
392 | 0 | } |
393 | | |
394 | 318 | trim(in); |
395 | | |
396 | 318 | if (nextChar(in) != ':') { |
397 | 0 | err_ = JsonbErrType::E_INVALID_OBJ; |
398 | 0 | return false; |
399 | 0 | } |
400 | | |
401 | 318 | trim(in); |
402 | 318 | if (!in.good()) { |
403 | 0 | err_ = JsonbErrType::E_INVALID_OBJ; |
404 | 0 | return false; |
405 | 0 | } |
406 | | |
407 | 318 | return true; |
408 | 318 | } |
409 | | |
410 | | // parse a value |
411 | 1.67k | bool parseValue(std::istream& in, hDictInsert handler) { |
412 | 1.67k | bool res = false; |
413 | | |
414 | 1.67k | switch (in.peek()) { |
415 | 0 | case 'N': |
416 | 109 | case 'n': { |
417 | 109 | skipChar(in); |
418 | 109 | res = parseNull(in); |
419 | 109 | break; |
420 | 0 | } |
421 | 0 | case 'T': |
422 | 109 | case 't': { |
423 | 109 | skipChar(in); |
424 | 109 | res = parseTrue(in); |
425 | 109 | break; |
426 | 0 | } |
427 | 0 | case 'F': |
428 | 109 | case 'f': { |
429 | 109 | skipChar(in); |
430 | 109 | res = parseFalse(in); |
431 | 109 | break; |
432 | 0 | } |
433 | 511 | case '"': { |
434 | 511 | skipChar(in); |
435 | 511 | res = parseString(in); |
436 | 511 | break; |
437 | 0 | } |
438 | 96 | case '{': { |
439 | 96 | skipChar(in); |
440 | 96 | ++nesting_lvl_; |
441 | 96 | if (nesting_lvl_ >= MaxNestingLevel) { |
442 | 0 | err_ = JsonbErrType::E_NESTING_LVL_OVERFLOW; |
443 | 0 | return false; |
444 | 0 | } |
445 | 96 | res = parseObject(in, handler); |
446 | 96 | if (res) { |
447 | 96 | --nesting_lvl_; |
448 | 96 | } |
449 | 96 | break; |
450 | 96 | } |
451 | 0 | case '[': { |
452 | 0 | skipChar(in); |
453 | 0 | ++nesting_lvl_; |
454 | 0 | if (nesting_lvl_ >= MaxNestingLevel) { |
455 | 0 | err_ = JsonbErrType::E_NESTING_LVL_OVERFLOW; |
456 | 0 | return false; |
457 | 0 | } |
458 | 0 | res = parseArray(in, handler); |
459 | 0 | if (res) { |
460 | 0 | --nesting_lvl_; |
461 | 0 | } |
462 | 0 | break; |
463 | 0 | } |
464 | 738 | default: { |
465 | 738 | res = parseNumber(in); |
466 | 738 | break; |
467 | 0 | } |
468 | 1.67k | } |
469 | | |
470 | 1.67k | return res; |
471 | 1.67k | } |
472 | | |
473 | | // parse NULL value |
474 | 172 | bool parseNull(std::istream& in) { |
475 | 172 | if (tolower(nextChar(in)) == 'u' && tolower(nextChar(in)) == 'l' && |
476 | 172 | tolower(nextChar(in)) == 'l') { |
477 | 172 | writer_.writeNull(); |
478 | 172 | return true; |
479 | 172 | } |
480 | | |
481 | 0 | err_ = JsonbErrType::E_INVALID_SCALAR_VALUE; |
482 | 0 | return false; |
483 | 172 | } |
484 | | |
485 | | // parse TRUE value |
486 | 172 | bool parseTrue(std::istream& in) { |
487 | 172 | if (tolower(nextChar(in)) == 'r' && tolower(nextChar(in)) == 'u' && |
488 | 172 | tolower(nextChar(in)) == 'e') { |
489 | 172 | writer_.writeBool(true); |
490 | 172 | return true; |
491 | 172 | } |
492 | | |
493 | 0 | err_ = JsonbErrType::E_INVALID_SCALAR_VALUE; |
494 | 0 | return false; |
495 | 172 | } |
496 | | |
497 | | // parse FALSE value |
498 | 172 | bool parseFalse(std::istream& in) { |
499 | 172 | if (tolower(nextChar(in)) == 'a' && tolower(nextChar(in)) == 'l' && |
500 | 172 | tolower(nextChar(in)) == 's' && tolower(nextChar(in)) == 'e') { |
501 | 172 | writer_.writeBool(false); |
502 | 172 | return true; |
503 | 172 | } |
504 | | |
505 | 0 | err_ = JsonbErrType::E_INVALID_SCALAR_VALUE; |
506 | 0 | return false; |
507 | 172 | } |
508 | | |
509 | | /* |
510 | | This is a helper function to parse the hex value. hex_num means the |
511 | | number of digits needed to be parsed. If less than zero, then it will |
512 | | consider all the characters between current and any character in JsonDelim. |
513 | | */ |
514 | 0 | unsigned parseHexHelper(std::istream& in, uint64_t& val, unsigned hex_num = 17) { |
515 | | // We can't read more than 17 digits, so when read 17 digits, it's overflow |
516 | 0 | val = 0; |
517 | 0 | unsigned num_digits = 0; |
518 | 0 | char ch = tolower(in.peek()); |
519 | 0 | while (in.good() && !strchr(kJsonDelim, ch) && num_digits != hex_num) { |
520 | 0 | if (ch >= '0' && ch <= '9') { |
521 | 0 | val = (val << 4) + (ch - '0'); |
522 | 0 | } else if (ch >= 'a' && ch <= 'f') { |
523 | 0 | val = (val << 4) + (ch - 'a' + 10); |
524 | 0 | } else { |
525 | | // unrecognized hex digit |
526 | 0 | return 0; |
527 | 0 | } |
528 | 0 | skipChar(in); |
529 | 0 | ch = tolower(in.peek()); |
530 | 0 | ++num_digits; |
531 | 0 | } |
532 | 0 | return num_digits; |
533 | 0 | } |
534 | | |
535 | | // parse HEX value |
536 | 0 | bool parseHex4(std::istream& in, unsigned& h) { |
537 | 0 | uint64_t val; |
538 | 0 | if (4 == parseHexHelper(in, val, 4)) { |
539 | 0 | h = (unsigned)val; |
540 | 0 | return true; |
541 | 0 | } |
542 | 0 | return false; |
543 | 0 | } |
544 | | |
545 | | /* |
546 | | parse Escape char. |
547 | | */ |
548 | 0 | bool parseEscape(std::istream& in, char* out, int& len) { |
549 | | /* |
550 | | This is extracted from cJSON implementation. |
551 | | This is about the mask of the first byte in UTF-8. |
552 | | The mask is defined in: |
553 | | http://en.wikipedia.org/wiki/UTF-8#Description |
554 | | */ |
555 | 0 | const unsigned char firstByteMark[6] = {0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC}; |
556 | 0 | if (!in.good()) { |
557 | 0 | return false; |
558 | 0 | } |
559 | 0 | char c = nextChar(in); |
560 | 0 | len = 1; |
561 | 0 | switch (c) { |
562 | | // \" \\ \/ \b \f \n \r \t |
563 | 0 | case '"': |
564 | 0 | *out = '"'; |
565 | 0 | return true; |
566 | 0 | case '\\': |
567 | 0 | *out = '\\'; |
568 | 0 | return true; |
569 | 0 | case '/': |
570 | 0 | *out = '/'; |
571 | 0 | return true; |
572 | 0 | case 'b': |
573 | 0 | *out = '\b'; |
574 | 0 | return true; |
575 | 0 | case 'f': |
576 | 0 | *out = '\f'; |
577 | 0 | return true; |
578 | 0 | case 'n': |
579 | 0 | *out = '\n'; |
580 | 0 | return true; |
581 | 0 | case 'r': |
582 | 0 | *out = '\r'; |
583 | 0 | return true; |
584 | 0 | case 't': |
585 | 0 | *out = '\t'; |
586 | 0 | return true; |
587 | 0 | case 'u': { |
588 | 0 | unsigned uc; |
589 | 0 | if (!parseHex4(in, uc)) { |
590 | 0 | return false; |
591 | 0 | } |
592 | | /* |
593 | | For DC00 to DFFF, it should be low surrogates for UTF16. |
594 | | So if it display in the high bits, it's invalid. |
595 | | */ |
596 | 0 | if (uc >= 0xDC00 && uc <= 0xDFFF) { |
597 | 0 | return false; |
598 | 0 | } |
599 | | |
600 | | /* |
601 | | For D800 to DBFF, it's the high surrogates for UTF16. |
602 | | So it's utf-16, there must be another one between 0xDC00 |
603 | | and 0xDFFF. |
604 | | */ |
605 | 0 | if (uc >= 0xD800 && uc <= 0xDBFF) { |
606 | 0 | unsigned uc2; |
607 | |
|
608 | 0 | if (!in.good()) { |
609 | 0 | return false; |
610 | 0 | } |
611 | 0 | c = nextChar(in); |
612 | 0 | if (c != '\\') { |
613 | 0 | return false; |
614 | 0 | } |
615 | | |
616 | 0 | if (!in.good()) { |
617 | 0 | return false; |
618 | 0 | } |
619 | 0 | c = nextChar(in); |
620 | 0 | if (c != 'u') { |
621 | 0 | return false; |
622 | 0 | } |
623 | | |
624 | 0 | if (!parseHex4(in, uc2)) { |
625 | 0 | return false; |
626 | 0 | } |
627 | | /* |
628 | | Now we need the low surrogates for UTF16. It should be |
629 | | within 0xDC00 and 0xDFFF. |
630 | | */ |
631 | 0 | if (uc2 < 0xDC00 || uc2 > 0xDFFF) return false; |
632 | | /* |
633 | | For the character that not in the Basic Multilingual Plan, |
634 | | it's represented as twelve-character, encoding the UTF-16 |
635 | | surrogate pair. |
636 | | UTF16 is between 0x10000 and 0x10FFFF. The high surrogate |
637 | | present the high bits and the low surrogate present the |
638 | | lower 10 bits. |
639 | | For detailed explanation, please refer to: |
640 | | http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-404.pdf |
641 | | Then it will be converted to UTF8. |
642 | | */ |
643 | 0 | uc = 0x10000 + (((uc & 0x3FF) << 10) | (uc2 & 0x3FF)); |
644 | 0 | } |
645 | | |
646 | | /* |
647 | | Get the length of the unicode. |
648 | | Please refer to http://en.wikipedia.org/wiki/UTF-8#Description. |
649 | | */ |
650 | 0 | if (uc < 0x80) |
651 | 0 | len = 1; |
652 | 0 | else if (uc < 0x800) |
653 | 0 | len = 2; |
654 | 0 | else if (uc < 0x10000) |
655 | 0 | len = 3; |
656 | 0 | else |
657 | 0 | len = 4; |
658 | 0 | out += len; |
659 | | /* |
660 | | Encode it. |
661 | | Please refer to http://en.wikipedia.org/wiki/UTF-8#Description. |
662 | | This part of code has a reference to cJSON. |
663 | | */ |
664 | 0 | switch (len) { |
665 | 0 | case 4: |
666 | 0 | *--out = ((uc | 0x80) & 0xBF); |
667 | 0 | uc >>= 6; |
668 | 0 | [[fallthrough]]; |
669 | 0 | case 3: |
670 | 0 | *--out = ((uc | 0x80) & 0xBF); |
671 | 0 | uc >>= 6; |
672 | 0 | [[fallthrough]]; |
673 | 0 | case 2: |
674 | 0 | *--out = ((uc | 0x80) & 0xBF); |
675 | 0 | uc >>= 6; |
676 | 0 | [[fallthrough]]; |
677 | 0 | case 1: |
678 | | // Mask the first byte according to the standard. |
679 | 0 | *--out = (uc | firstByteMark[len - 1]); |
680 | 0 | } |
681 | 0 | return true; |
682 | 0 | break; |
683 | 0 | } |
684 | 0 | default: |
685 | 0 | return false; |
686 | 0 | break; |
687 | 0 | } |
688 | 0 | } |
689 | | |
690 | | // parse a string |
691 | 582 | bool parseString(std::istream& in) { |
692 | 582 | const int BUFFER_LEN = 4096; |
693 | 582 | if (!writer_.writeStartString()) { |
694 | 0 | err_ = JsonbErrType::E_OUTPUT_FAIL; |
695 | 0 | return false; |
696 | 0 | } |
697 | | |
698 | | // write 4KB at a time |
699 | 582 | char buffer[BUFFER_LEN]; |
700 | 582 | int nread = 0; |
701 | 2.20k | while (in.good()) { |
702 | 2.20k | char ch = nextChar(in); |
703 | 2.20k | if (ch == '"') { |
704 | | // write all remaining bytes in the buffer |
705 | 582 | if (nread > 0) { |
706 | 582 | if (!writer_.writeString(buffer, nread)) { |
707 | 0 | err_ = JsonbErrType::E_OUTPUT_FAIL; |
708 | 0 | return false; |
709 | 0 | } |
710 | 582 | } |
711 | | // end writing string |
712 | 582 | if (!writer_.writeEndString()) { |
713 | 0 | err_ = JsonbErrType::E_OUTPUT_FAIL; |
714 | 0 | return false; |
715 | 0 | } |
716 | 582 | return true; |
717 | 1.61k | } else if (ch == '\\') { |
718 | | // this is a escape char |
719 | 0 | char escape_buffer[5]; // buffer for escape |
720 | 0 | int len; |
721 | 0 | if (!parseEscape(in, escape_buffer, len)) { |
722 | 0 | err_ = JsonbErrType::E_INVALID_STR; |
723 | 0 | return false; |
724 | 0 | } |
725 | | |
726 | | // Write each char to the buffer |
727 | 0 | for (int i = 0; i != len; ++i) { |
728 | 0 | buffer[nread++] = escape_buffer[i]; |
729 | 0 | if (nread == BUFFER_LEN) { |
730 | 0 | if (!writer_.writeString(buffer, nread)) { |
731 | 0 | err_ = JsonbErrType::E_OUTPUT_FAIL; |
732 | 0 | return false; |
733 | 0 | } |
734 | 0 | nread = 0; |
735 | 0 | } |
736 | 0 | } |
737 | 1.61k | } else { |
738 | | // just a char |
739 | 1.61k | buffer[nread++] = ch; |
740 | 1.61k | if (nread == BUFFER_LEN) { |
741 | | // flush buffer |
742 | 0 | if (!writer_.writeString(buffer, nread)) { |
743 | 0 | err_ = JsonbErrType::E_OUTPUT_FAIL; |
744 | 0 | return false; |
745 | 0 | } |
746 | 0 | nread = 0; |
747 | 0 | } |
748 | 1.61k | } |
749 | 2.20k | } |
750 | | |
751 | 0 | err_ = JsonbErrType::E_INVALID_STR; |
752 | 0 | return false; |
753 | 582 | } |
754 | | |
755 | | // parse a number |
756 | | // Number format can be hex, octal, or decimal (including float). |
757 | | // Only decimal can have (+/-) sign prefix. |
758 | 1.15k | bool parseNumber(std::istream& in) { |
759 | 1.15k | bool ret = false; |
760 | 1.15k | switch (in.peek()) { |
761 | 0 | case '0': { |
762 | 0 | skipChar(in); |
763 | |
|
764 | 0 | if (in.peek() == 'x' || in.peek() == 'X') { |
765 | 0 | skipChar(in); |
766 | 0 | ret = parseHex(in); |
767 | 0 | } else if (in.peek() == '.') { |
768 | 0 | skipChar(in); // remove '.' |
769 | 0 | num_buf_[0] = '.'; |
770 | 0 | ret = parseDouble(in, num_buf_ + 1); |
771 | 0 | } else { |
772 | 0 | ret = parseOctal(in); |
773 | 0 | } |
774 | |
|
775 | 0 | break; |
776 | 0 | } |
777 | 0 | case '-': { |
778 | 0 | skipChar(in); |
779 | 0 | ret = parseDecimal(in, true); |
780 | 0 | break; |
781 | 0 | } |
782 | 0 | case '+': |
783 | 0 | skipChar(in); |
784 | | // fall through |
785 | 1.15k | default: |
786 | 1.15k | ret = parseDecimal(in); |
787 | 1.15k | break; |
788 | 1.15k | } |
789 | | |
790 | 1.15k | return ret; |
791 | 1.15k | } |
792 | | |
793 | | // parse a number in hex format |
794 | 0 | bool parseHex(std::istream& in) { |
795 | 0 | uint64_t val = 0; |
796 | 0 | int num_digits; |
797 | 0 | if (0 == (num_digits = parseHexHelper(in, val))) { |
798 | 0 | err_ = JsonbErrType::E_INVALID_HEX; |
799 | 0 | return false; |
800 | 0 | } |
801 | | |
802 | 0 | int size = 0; |
803 | 0 | if (num_digits <= 2) { |
804 | 0 | size = writer_.writeInt8((int8_t)val); |
805 | 0 | } else if (num_digits <= 4) { |
806 | 0 | size = writer_.writeInt16((int16_t)val); |
807 | 0 | } else if (num_digits <= 8) { |
808 | 0 | size = writer_.writeInt32((int32_t)val); |
809 | 0 | } else if (num_digits <= 16) { |
810 | 0 | size = writer_.writeInt64(val); |
811 | 0 | } else { |
812 | 0 | err_ = JsonbErrType::E_HEX_OVERFLOW; |
813 | 0 | return false; |
814 | 0 | } |
815 | | |
816 | 0 | if (size == 0) { |
817 | 0 | err_ = JsonbErrType::E_OUTPUT_FAIL; |
818 | 0 | return false; |
819 | 0 | } |
820 | | |
821 | 0 | return true; |
822 | 0 | } |
823 | | |
824 | | // parse a number in octal format |
825 | 0 | bool parseOctal(std::istream& in) { |
826 | 0 | int64_t val = 0; |
827 | 0 | char ch = in.peek(); |
828 | 0 | while (in.good() && !strchr(kJsonDelim, ch)) { |
829 | 0 | if (ch >= '0' && ch <= '7') { |
830 | 0 | val = val * 8 + (ch - '0'); |
831 | 0 | } else { |
832 | 0 | err_ = JsonbErrType::E_INVALID_OCTAL; |
833 | 0 | return false; |
834 | 0 | } |
835 | | |
836 | | // check if the number overflows |
837 | 0 | if (val < 0) { |
838 | 0 | err_ = JsonbErrType::E_OCTAL_OVERFLOW; |
839 | 0 | return false; |
840 | 0 | } |
841 | | |
842 | 0 | skipChar(in); |
843 | 0 | ch = in.peek(); |
844 | 0 | } |
845 | | |
846 | 0 | int size = 0; |
847 | 0 | if (val <= std::numeric_limits<int8_t>::max()) { |
848 | 0 | size = writer_.writeInt8((int8_t)val); |
849 | 0 | } else if (val <= std::numeric_limits<int16_t>::max()) { |
850 | 0 | size = writer_.writeInt16((int16_t)val); |
851 | 0 | } else if (val <= std::numeric_limits<int32_t>::max()) { |
852 | 0 | size = writer_.writeInt32((int32_t)val); |
853 | 0 | } else { // val <= INT64_MAX |
854 | 0 | size = writer_.writeInt64(val); |
855 | 0 | } |
856 | |
|
857 | 0 | if (size == 0) { |
858 | 0 | err_ = JsonbErrType::E_OUTPUT_FAIL; |
859 | 0 | return false; |
860 | 0 | } |
861 | | |
862 | 0 | return true; |
863 | 0 | } |
864 | | |
865 | | // parse a number in decimal (including float) |
866 | 1.15k | bool parseDecimal(std::istream& in, bool neg = false) { |
867 | 1.15k | char ch = 0; |
868 | 1.15k | while (in.good() && (ch = in.peek()) == '0') skipChar(in); |
869 | | |
870 | 1.15k | char* pbuf = num_buf_; |
871 | 1.15k | if (neg) *(pbuf++) = '-'; |
872 | | |
873 | 1.15k | char* save_pos = pbuf; |
874 | 5.18k | while (in.good() && !strchr(kJsonDelim, ch)) { |
875 | 4.40k | *(pbuf++) = ch; |
876 | 4.40k | if (pbuf == end_buf_) { |
877 | 0 | err_ = JsonbErrType::E_DECIMAL_OVERFLOW; |
878 | 0 | return false; |
879 | 0 | } |
880 | | |
881 | 4.40k | if (ch == '.') { |
882 | 282 | skipChar(in); // remove '.' |
883 | 282 | return parseDouble(in, pbuf); |
884 | 4.12k | } else if (ch == 'E' || ch == 'e') { |
885 | 0 | skipChar(in); // remove 'E' |
886 | 0 | return parseExponent(in, pbuf); |
887 | 4.12k | } else if (ch < '0' || ch > '9') { |
888 | 92 | err_ = JsonbErrType::E_INVALID_DECIMAL; |
889 | 92 | return false; |
890 | 92 | } |
891 | | |
892 | 4.02k | skipChar(in); |
893 | 4.02k | ch = in.peek(); |
894 | 4.02k | } |
895 | 777 | if (save_pos == pbuf) { |
896 | 0 | err_ = JsonbErrType::E_INVALID_DECIMAL; // empty input |
897 | 0 | return false; |
898 | 0 | } |
899 | | |
900 | 777 | *pbuf = 0; // set null-terminator |
901 | 777 | StringParser::ParseResult parse_result = StringParser::PARSE_SUCCESS; |
902 | 777 | int128_t val = |
903 | 777 | StringParser::string_to_int<int128_t>(num_buf_, pbuf - num_buf_, &parse_result); |
904 | 777 | if (parse_result != StringParser::PARSE_SUCCESS) { |
905 | 0 | VLOG_ROW << "debug string_to_int error for " << num_buf_ << " val=" << val |
906 | 0 | << " parse_result=" << parse_result; |
907 | 0 | err_ = JsonbErrType::E_DECIMAL_OVERFLOW; |
908 | 0 | return false; |
909 | 0 | } |
910 | | |
911 | 777 | int size = 0; |
912 | 777 | if (val >= std::numeric_limits<int8_t>::min() && |
913 | 777 | val <= std::numeric_limits<int8_t>::max()) { |
914 | 357 | size = writer_.writeInt8((int8_t)val); |
915 | 420 | } else if (val >= std::numeric_limits<int16_t>::min() && |
916 | 420 | val <= std::numeric_limits<int16_t>::max()) { |
917 | 294 | size = writer_.writeInt16((int16_t)val); |
918 | 294 | } else if (val >= std::numeric_limits<int32_t>::min() && |
919 | 126 | val <= std::numeric_limits<int32_t>::max()) { |
920 | 63 | size = writer_.writeInt32((int32_t)val); |
921 | 63 | } else if (val >= std::numeric_limits<int64_t>::min() && |
922 | 63 | val <= std::numeric_limits<int64_t>::max()) { |
923 | 63 | size = writer_.writeInt64((int64_t)val); |
924 | 63 | } else { // INT128 |
925 | 0 | size = writer_.writeInt128(val); |
926 | 0 | } |
927 | | |
928 | 777 | if (size == 0) { |
929 | 0 | err_ = JsonbErrType::E_OUTPUT_FAIL; |
930 | 0 | return false; |
931 | 0 | } |
932 | | |
933 | 777 | return true; |
934 | 777 | } |
935 | | |
936 | | // parse IEEE745 double precision |
937 | 282 | bool parseDouble(std::istream& in, char* pbuf) { |
938 | 282 | char* save_pos = pbuf; |
939 | 282 | char ch = in.peek(); |
940 | 824 | while (in.good() && !strchr(kJsonDelim, ch)) { |
941 | 553 | *(pbuf++) = ch; |
942 | 553 | if (pbuf == end_buf_) { |
943 | 0 | err_ = JsonbErrType::E_DOUBLE_OVERFLOW; |
944 | 0 | return false; |
945 | 0 | } |
946 | | |
947 | 553 | if (ch == 'e' || ch == 'E') { |
948 | 0 | skipChar(in); // remove 'E' |
949 | 0 | return parseExponent(in, pbuf); |
950 | 553 | } else if (ch < '0' || ch > '9') { |
951 | 11 | err_ = JsonbErrType::E_INVALID_DECIMAL; |
952 | 11 | return false; |
953 | 11 | } |
954 | | |
955 | 542 | skipChar(in); |
956 | 542 | ch = in.peek(); |
957 | 542 | } |
958 | 271 | if (save_pos == pbuf) { |
959 | 0 | err_ = JsonbErrType::E_INVALID_DECIMAL; // empty input |
960 | 0 | return false; |
961 | 0 | } |
962 | | |
963 | 271 | *pbuf = 0; // set null-terminator |
964 | 271 | return internConvertBufferToDouble(num_buf_, pbuf - num_buf_); |
965 | 271 | } |
966 | | |
967 | | // parse the exponent part of a double number |
968 | 0 | bool parseExponent(std::istream& in, char* pbuf) { |
969 | 0 | char ch = in.peek(); |
970 | 0 | if (in.good()) { |
971 | 0 | if (ch == '+' || ch == '-') { |
972 | 0 | *(pbuf++) = ch; |
973 | 0 | if (pbuf == end_buf_) { |
974 | 0 | err_ = JsonbErrType::E_DOUBLE_OVERFLOW; |
975 | 0 | return false; |
976 | 0 | } |
977 | 0 | skipChar(in); |
978 | 0 | ch = in.peek(); |
979 | 0 | } |
980 | 0 | } |
981 | | |
982 | 0 | char* save_pos = pbuf; |
983 | 0 | while (in.good() && !strchr(kJsonDelim, ch)) { |
984 | 0 | *(pbuf++) = ch; |
985 | 0 | if (pbuf == end_buf_) { |
986 | 0 | err_ = JsonbErrType::E_DOUBLE_OVERFLOW; |
987 | 0 | return false; |
988 | 0 | } |
989 | | |
990 | 0 | if (ch < '0' || ch > '9') { |
991 | 0 | err_ = JsonbErrType::E_INVALID_EXPONENT; |
992 | 0 | return false; |
993 | 0 | } |
994 | | |
995 | 0 | skipChar(in); |
996 | 0 | ch = in.peek(); |
997 | 0 | } |
998 | 0 | if (save_pos == pbuf) { |
999 | 0 | err_ = JsonbErrType::E_INVALID_EXPONENT; // empty input |
1000 | 0 | return false; |
1001 | 0 | } |
1002 | | |
1003 | 0 | *pbuf = 0; // set null-terminator |
1004 | 0 | return internConvertBufferToDouble(num_buf_, pbuf - num_buf_); |
1005 | 0 | } |
1006 | | |
1007 | | // call system function to parse double to string |
1008 | 271 | bool internConvertBufferToDouble(char* num_buf_, int len) { |
1009 | 271 | StringParser::ParseResult parse_result = StringParser::PARSE_SUCCESS; |
1010 | 271 | double val = StringParser::string_to_float<double>(num_buf_, len, &parse_result); |
1011 | 271 | if (parse_result != StringParser::PARSE_SUCCESS) { |
1012 | 0 | VLOG_ROW << "debug string_to_float error for " << num_buf_ << " val=" << val |
1013 | 0 | << " parse_result=" << parse_result; |
1014 | 0 | err_ = JsonbErrType::E_DECIMAL_OVERFLOW; |
1015 | 0 | return false; |
1016 | 0 | } |
1017 | | |
1018 | 271 | if (writer_.writeDouble(val) == 0) { |
1019 | 0 | err_ = JsonbErrType::E_OUTPUT_FAIL; |
1020 | 0 | return false; |
1021 | 0 | } |
1022 | | |
1023 | 271 | return true; |
1024 | 271 | } |
1025 | | |
1026 | 6.49k | void trim(std::istream& in) { |
1027 | 7.79k | while (in.good() && strchr(kWhiteSpace, in.peek())) { |
1028 | 1.30k | skipChar(in); |
1029 | 1.30k | } |
1030 | 6.49k | } |
1031 | | |
1032 | | /* |
1033 | | * Helper functions to keep track of characters read. |
1034 | | * Do not rely on std::istream's tellg() which may not be implemented. |
1035 | | */ |
1036 | | |
1037 | 6.86k | char nextChar(std::istream& in) { |
1038 | 6.86k | ++stream_pos_; |
1039 | 6.86k | return in.get(); |
1040 | 6.86k | } |
1041 | | |
1042 | 8.36k | void skipChar(std::istream& in) { |
1043 | 8.36k | ++stream_pos_; |
1044 | 8.36k | in.ignore(); |
1045 | 8.36k | } |
1046 | | |
1047 | | private: |
1048 | | JsonbWriterT<OS_TYPE> writer_; |
1049 | | uint32_t stream_pos_; |
1050 | | JsonbErrType err_; |
1051 | | char num_buf_[512]; // buffer to hold number string |
1052 | | const char* end_buf_ = num_buf_ + sizeof(num_buf_) - 1; |
1053 | | uint32_t nesting_lvl_ = 0; |
1054 | | }; |
1055 | | |
1056 | | typedef JsonbParserT<JsonbOutStream> JsonbParser; |
1057 | | |
1058 | | } // namespace doris |
1059 | | |
1060 | | #endif // JSONB_JSONBJSONPARSER_H |