Coverage Report

Created: 2025-07-25 10:29

/root/doris/be/src/util/jsonb_parser.h
Line
Count
Source (jump to first uncovered line)
1
/*
2
 *  Copyright (c) 2014, Facebook, Inc.
3
 *  All rights reserved.
4
 *
5
 *  This source code is licensed under the BSD-style license found in the
6
 *  LICENSE file in the root directory of this source tree. An additional grant
7
 *  of patent rights can be found in the PATENTS file in the same directory.
8
 *
9
 */
10
11
/*
12
 * This file defines JsonbParserT (template) and JsonbParser.
13
 *
14
 * JsonbParserT is a template class which implements a JSON parser.
15
 * JsonbParserT parses JSON text, and serializes it to JSONB binary format
16
 * by using JsonbWriterT object. By default, JsonbParserT creates a new
17
 * JsonbWriterT object with an output stream object.  However, you can also
18
 * pass in your JsonbWriterT or any stream object that implements some basic
19
 * interface of std::ostream (see JsonbStream.h).
20
 *
21
 * JsonbParser specializes JsonbParserT with JsonbOutStream type (see
22
 * JsonbStream.h). So unless you want to provide your own, different output stream
23
 * type, use JsonbParser object.
24
 *
25
 * ** Parsing JSON **
26
 * JsonbParserT parses JSON string, and directly serializes into JSONB
27
 * packed bytes. There are three ways to parse a JSON string: (1) using
28
 * c-string, (2) using string with len, (3) using std::istream object. You can
29
 * use custom streambuf to redirect output. JsonbInBuffer is a streambuf used
30
 * internally if the input is raw character buffer.
31
 *
32
 * You can reuse a JsonbParserT object to parse/serialize multiple JSON
33
 * strings, and the previous JSONB will be overwritten.
34
 *
35
 * If parsing fails (returned false), the error code will be set to one of
36
 * JsonbErrType, and can be retrieved by calling getErrorCode().
37
 *
38
 * ** External dictionary **
39
 * During parsing a JSON string, you can pass a call-back function to map a key
40
 * string to an id, and store the dictionary id in JSONB to save space. The
41
 * purpose of using an external dictionary is more towards a collection of
42
 * documents (which has common keys) rather than a single document, so that
43
 * space saving will be significant.
44
 *
45
 * ** Endianness **
46
 * Note: JSONB serialization doesn't assume endianness of the server. However
47
 * you will need to ensure that the endianness at the reader side is the same
48
 * as that at the writer side (if they are on different machines). Otherwise,
49
 * proper conversion is needed when a number value is returned to the
50
 * caller/writer.
51
 *
52
 * @author Tian Xia <tianx@fb.com>
53
 * 
54
 * this file is copied from 
55
 * https://github.com/facebook/mysql-5.6/blob/fb-mysql-5.6.35/fbson/FbsonJsonParser.h
56
 * and modified by Doris
57
 */
58
59
#ifndef JSONB_JSONBJSONPARSER_H
60
#define JSONB_JSONBJSONPARSER_H
61
62
#include <cmath>
63
#include <limits>
64
65
#include "jsonb_document.h"
66
#include "jsonb_error.h"
67
#include "jsonb_writer.h"
68
#include "string_parser.hpp"
69
70
namespace doris {
71
72
// Characters that end a numeric token: space, comma, ']', '}', tab, CR and LF.
// The number parsers stop scanning when strchr() finds the next character here.
const char* const kJsonDelim = " ,]}\t\r\n";
73
// Whitespace characters: space, tab, LF and CR.
// NOTE(review): not referenced in this part of the file; presumably consumed by trim() - confirm.
const char* const kWhiteSpace = " \t\n\r";
74
75
/*
76
 * Template JsonbParserT
77
 */
78
template <class OS_TYPE>
79
class JsonbParserT {
80
public:
81
879
    // Default constructor: stream position starts at 0 and no error is recorded.
    JsonbParserT() : stream_pos_(0), err_(JsonbErrType::E_NONE) {}
82
83
    // Constructs a parser whose writer is initialised from the caller-supplied output stream `os`.
    explicit JsonbParserT(OS_TYPE& os) : writer_(os), stream_pos_(0), err_(JsonbErrType::E_NONE) {}
84
85
    // parse a UTF-8 JSON string
86
    bool parse(const std::string& str, hDictInsert handler = nullptr) {
87
        return parse(str.c_str(), (unsigned int)str.size(), handler);
88
    }
89
90
    // parse a UTF-8 JSON c-style string (NULL terminated)
91
    bool parse(const char* c_str, hDictInsert handler = nullptr) {
92
        return parse(c_str, (unsigned int)strlen(c_str), handler);
93
    }
94
95
    // parse a UTF-8 JSON string with length
96
1.08k
    bool parse(const char* pch, unsigned int len, hDictInsert handler = nullptr) {
97
1.08k
        if (!pch || len == 0) {
98
0
            err_ = JsonbErrType::E_EMPTY_DOCUMENT;
99
0
            return false;
100
0
        }
101
102
1.08k
        JsonbInBuffer sb(pch, len);
103
1.08k
        std::istream in(&sb);
104
1.08k
        return parse(in, handler);
105
1.08k
    }
106
107
    // parse UTF-8 JSON text from an input stream
108
1.08k
    bool parse(std::istream& in, hDictInsert handler = nullptr) {
109
1.08k
        bool res = false;
110
1.08k
        err_ = JsonbErrType::E_NONE;
111
1.08k
        stream_pos_ = 0;
112
113
        // reset output stream
114
1.08k
        writer_.reset();
115
116
1.08k
        trim(in);
117
118
        // TODO(wzy): parsePrimitive should be implemented
119
1.08k
        if (in.peek() == '{') {
120
126
            skipChar(in);
121
126
            res = parseObject(in, handler);
122
955
        } else if (in.peek() == '[') {
123
391
            skipChar(in);
124
391
            res = parseArray(in, handler);
125
564
        } else {
126
564
            res = parsePrimitive(in, handler);
127
564
            if (!res) err_ = handle_parse_failure(in);
128
564
        }
129
130
1.08k
        trim(in);
131
1.08k
        if (res && !in.eof()) {
132
0
            err_ = JsonbErrType::E_INVALID_DOCU;
133
0
            return false;
134
0
        }
135
136
1.08k
        return res;
137
1.08k
    }
138
139
2.02k
    // Returns a mutable reference to the writer that the parse functions write into.
    JsonbWriterT<OS_TYPE>& getWriter() { return writer_; }
140
141
68
    // Returns the current error code; parse() and clearErr() set it back to E_NONE.
    JsonbErrType getErrorCode() { return err_; }
142
143
    // Bundles the current error's message and the recorded stream position.
    JsonbErrInfo getErrorInfo() {
        assert(err_ < JsonbErrType::E_NUM_ERRORS);

        JsonbErrInfo info;
        info.err_msg = JsonbErrMsg::getErrMsg(err_);
        // stream_pos_ always points to the next char, so err_pos is 1-based
        info.err_pos = stream_pos_;
        return info;
    }
154
155
    // clear error code
156
    // Resets only the error code to E_NONE; stream position and writer are left untouched.
    void clearErr() { err_ = JsonbErrType::E_NONE; }
157
158
private:
159
0
    // Classifies the outcome of re-parsing a top-level literal.
    // Returns E_INVALID_DOCU_COMPAT only when the literal parsed successfully and the
    // stream is no longer good() after trimming; otherwise returns E_INVALID_DOCU.
    JsonbErrType handle_parse_value_failure(bool parse_res, std::istream& in) {
        if (!parse_res) {
            return JsonbErrType::E_INVALID_DOCU;
        }

        trim(in);
        return in.good() ? JsonbErrType::E_INVALID_DOCU : JsonbErrType::E_INVALID_DOCU_COMPAT;
    }
169
170
    // In case json is determined to be invalid at top level,
171
    // try to parse literal values.
172
    // We return a different error code E_INVALID_DOCU_COMPAT
173
    // in case the input json contains these values.
174
    // Returning a different error code will cause an
175
    // auditing on the caller.
176
    // This is mainly done because 8.0 JSON_VALID considers
177
    // this as a valid input.
178
46
    JsonbErrType handle_parse_failure(std::istream& in) {
        JsonbErrType verdict = JsonbErrType::E_INVALID_DOCU;
        if (!writer_.writeStartArray()) {
            return verdict;
        }

        // Dispatch on the next character, mirroring the literal kinds of parsePrimitive().
        const auto lead = in.peek();
        if (lead == 'n') {
            skipChar(in);
            const bool parsed = parseNull(in);
            verdict = handle_parse_value_failure(parsed, in);
        } else if (lead == 't') {
            skipChar(in);
            const bool parsed = parseTrue(in);
            verdict = handle_parse_value_failure(parsed, in);
        } else if (lead == 'f') {
            skipChar(in);
            const bool parsed = parseFalse(in);
            verdict = handle_parse_value_failure(parsed, in);
        } else if (lead == '"') {
            skipChar(in);
            const bool parsed = parseString(in);
            verdict = handle_parse_value_failure(parsed, in);
        } else if (parseNumber(in)) {
            // A number counts as "compatible" only when nothing but whitespace follows it.
            trim(in);
            if (in.eof()) {
                verdict = JsonbErrType::E_INVALID_DOCU_COMPAT;
            }
        }

        // The array is closed for its side effect; the verdict is the same whether or
        // not closing succeeds.
        static_cast<void>(writer_.writeEndArray());
        return verdict;
    }
215
216
    // parse primitive
217
564
    bool parsePrimitive(std::istream& in, hDictInsert handler) {
218
564
        bool res = false;
219
564
        switch (in.peek()) {
220
56
        case 'n':
221
56
            skipChar(in);
222
56
            res = parseNull(in);
223
56
            break;
224
56
        case 't':
225
56
            skipChar(in);
226
56
            res = parseTrue(in);
227
56
            break;
228
56
        case 'f':
229
56
            skipChar(in);
230
56
            res = parseFalse(in);
231
56
            break;
232
64
        case '"':
233
64
            skipChar(in);
234
64
            res = parseString(in);
235
64
            break;
236
332
        default:
237
332
            res = parseNumber(in);
238
564
        }
239
240
564
        return res;
241
564
    }
242
243
    // parse a JSON object (comma-separated list of key-value pairs)
244
215
    bool parseObject(std::istream& in, hDictInsert handler) {
245
215
        if (!writer_.writeStartObject()) {
246
0
            err_ = JsonbErrType::E_OUTPUT_FAIL;
247
0
            return false;
248
0
        }
249
250
215
        trim(in);
251
252
215
        if (in.peek() == '}') {
253
59
            skipChar(in);
254
            // empty object
255
59
            if (!writer_.writeEndObject()) {
256
0
                err_ = JsonbErrType::E_OUTPUT_FAIL;
257
0
                return false;
258
0
            }
259
59
            return true;
260
59
        }
261
262
301
        while (in.good()) {
263
301
            if (nextChar(in) != '"') {
264
11
                err_ = JsonbErrType::E_INVALID_OBJ;
265
11
                return false;
266
11
            }
267
268
290
            if (!parseKVPair(in, handler)) {
269
0
                return false;
270
0
            }
271
272
290
            trim(in);
273
274
290
            char ch = nextChar(in);
275
290
            if (ch == '}') {
276
                // end of the object
277
145
                if (!writer_.writeEndObject()) {
278
0
                    err_ = JsonbErrType::E_OUTPUT_FAIL;
279
0
                    return false;
280
0
                }
281
145
                return true;
282
145
            } else if (ch != ',') {
283
0
                err_ = JsonbErrType::E_INVALID_OBJ;
284
0
                return false;
285
0
            }
286
287
145
            trim(in);
288
145
        }
289
290
0
        err_ = JsonbErrType::E_INVALID_OBJ;
291
0
        return false;
292
156
    }
293
294
    // parse a JSON array (comma-separated list of values)
295
391
    bool parseArray(std::istream& in, hDictInsert handler) {
296
391
        if (!writer_.writeStartArray()) {
297
0
            err_ = JsonbErrType::E_OUTPUT_FAIL;
298
0
            return false;
299
0
        }
300
301
391
        trim(in);
302
303
391
        if (in.peek() == ']') {
304
56
            skipChar(in);
305
            // empty array
306
56
            if (!writer_.writeEndArray()) {
307
0
                err_ = JsonbErrType::E_OUTPUT_FAIL;
308
0
                return false;
309
0
            }
310
56
            return true;
311
56
        }
312
313
1.25k
        while (in.good()) {
314
1.25k
            if (!parseValue(in, handler)) {
315
11
                return false;
316
11
            }
317
318
1.24k
            trim(in);
319
320
1.24k
            char ch = nextChar(in);
321
1.24k
            if (ch == ']') {
322
                // end of the array
323
324
                if (!writer_.writeEndArray()) {
324
0
                    err_ = JsonbErrType::E_OUTPUT_FAIL;
325
0
                    return false;
326
0
                }
327
324
                return true;
328
921
            } else if (ch != ',') {
329
0
                err_ = JsonbErrType::E_INVALID_ARR;
330
0
                return false;
331
0
            }
332
333
921
            trim(in);
334
921
        }
335
336
0
        err_ = JsonbErrType::E_INVALID_ARR;
337
0
        return false;
338
335
    }
339
340
    // parse a key-value pair, separated by ":"
341
290
    bool parseKVPair(std::istream& in, hDictInsert handler) {
342
290
        if (parseKey(in, handler) && parseValue(in, handler)) {
343
290
            return true;
344
290
        }
345
346
0
        return false;
347
290
    }
348
349
    // parse a key (must be string)
350
290
    bool parseKey(std::istream& in, hDictInsert handler) {
351
290
        char key[JsonbKeyValue::sMaxKeyLen];
352
290
        int key_len = 0;
353
870
        while (in.good() && in.peek() != '"' && key_len < JsonbKeyValue::sMaxKeyLen) {
354
580
            char ch = nextChar(in);
355
580
            if (ch == '\\') {
356
0
                char escape_buffer[5]; // buffer for escape
357
0
                int len;
358
0
                if (!parseEscape(in, escape_buffer, len)) {
359
0
                    err_ = JsonbErrType::E_INVALID_KEY_STRING;
360
0
                    return false;
361
0
                }
362
0
                if (key_len + len >= JsonbKeyValue::sMaxKeyLen) {
363
0
                    err_ = JsonbErrType::E_INVALID_KEY_LENGTH;
364
0
                    return false;
365
0
                }
366
0
                memcpy(key + key_len, escape_buffer, len);
367
0
                key_len += len;
368
580
            } else {
369
580
                key[key_len++] = ch;
370
580
            }
371
580
        }
372
373
290
        if (!in.good() || in.peek() != '"' || key_len == 0) {
374
0
            if (key_len == JsonbKeyValue::sMaxKeyLen)
375
0
                err_ = JsonbErrType::E_INVALID_KEY_LENGTH;
376
0
            else
377
0
                err_ = JsonbErrType::E_INVALID_KEY_STRING;
378
0
            return false;
379
0
        }
380
381
290
        skipChar(in); // discard '"'
382
383
290
        int key_id = -1;
384
290
        if (handler) {
385
0
            key_id = handler(key, key_len);
386
0
        }
387
388
290
        if (key_id < 0) {
389
290
            writer_.writeKey(key, key_len);
390
290
        } else {
391
0
            writer_.writeKey(key_id);
392
0
        }
393
394
290
        trim(in);
395
396
290
        if (nextChar(in) != ':') {
397
0
            err_ = JsonbErrType::E_INVALID_OBJ;
398
0
            return false;
399
0
        }
400
401
290
        trim(in);
402
290
        if (!in.good()) {
403
0
            err_ = JsonbErrType::E_INVALID_OBJ;
404
0
            return false;
405
0
        }
406
407
290
        return true;
408
290
    }
409
410
    // parse a value
411
1.54k
    bool parseValue(std::istream& in, hDictInsert handler) {
412
1.54k
        bool res = false;
413
414
1.54k
        switch (in.peek()) {
415
0
        case 'N':
416
102
        case 'n': {
417
102
            skipChar(in);
418
102
            res = parseNull(in);
419
102
            break;
420
0
        }
421
0
        case 'T':
422
102
        case 't': {
423
102
            skipChar(in);
424
102
            res = parseTrue(in);
425
102
            break;
426
0
        }
427
0
        case 'F':
428
102
        case 'f': {
429
102
            skipChar(in);
430
102
            res = parseFalse(in);
431
102
            break;
432
0
        }
433
469
        case '"': {
434
469
            skipChar(in);
435
469
            res = parseString(in);
436
469
            break;
437
0
        }
438
89
        case '{': {
439
89
            skipChar(in);
440
89
            ++nesting_lvl_;
441
89
            if (nesting_lvl_ >= MaxNestingLevel) {
442
0
                err_ = JsonbErrType::E_NESTING_LVL_OVERFLOW;
443
0
                return false;
444
0
            }
445
89
            res = parseObject(in, handler);
446
89
            if (res) {
447
89
                --nesting_lvl_;
448
89
            }
449
89
            break;
450
89
        }
451
0
        case '[': {
452
0
            skipChar(in);
453
0
            ++nesting_lvl_;
454
0
            if (nesting_lvl_ >= MaxNestingLevel) {
455
0
                err_ = JsonbErrType::E_NESTING_LVL_OVERFLOW;
456
0
                return false;
457
0
            }
458
0
            res = parseArray(in, handler);
459
0
            if (res) {
460
0
                --nesting_lvl_;
461
0
            }
462
0
            break;
463
0
        }
464
682
        default: {
465
682
            res = parseNumber(in);
466
682
            break;
467
0
        }
468
1.54k
        }
469
470
1.54k
        return res;
471
1.54k
    }
472
473
    // parse NULL value
474
158
    bool parseNull(std::istream& in) {
475
158
        if (tolower(nextChar(in)) == 'u' && tolower(nextChar(in)) == 'l' &&
476
158
            tolower(nextChar(in)) == 'l') {
477
158
            writer_.writeNull();
478
158
            return true;
479
158
        }
480
481
0
        err_ = JsonbErrType::E_INVALID_SCALAR_VALUE;
482
0
        return false;
483
158
    }
484
485
    // parse TRUE value
486
158
    bool parseTrue(std::istream& in) {
487
158
        if (tolower(nextChar(in)) == 'r' && tolower(nextChar(in)) == 'u' &&
488
158
            tolower(nextChar(in)) == 'e') {
489
158
            writer_.writeBool(true);
490
158
            return true;
491
158
        }
492
493
0
        err_ = JsonbErrType::E_INVALID_SCALAR_VALUE;
494
0
        return false;
495
158
    }
496
497
    // parse FALSE value
498
158
    bool parseFalse(std::istream& in) {
499
158
        if (tolower(nextChar(in)) == 'a' && tolower(nextChar(in)) == 'l' &&
500
158
            tolower(nextChar(in)) == 's' && tolower(nextChar(in)) == 'e') {
501
158
            writer_.writeBool(false);
502
158
            return true;
503
158
        }
504
505
0
        err_ = JsonbErrType::E_INVALID_SCALAR_VALUE;
506
0
        return false;
507
158
    }
508
509
    /*
510
    This is a helper function to parse the hex value. hex_num means the
511
    number of digits needed to be parsed. If less than zero, then it will
512
    consider all the characters between current and any character in JsonDelim.
513
  */
514
0
    unsigned parseHexHelper(std::istream& in, uint64_t& val, unsigned hex_num = 17) {
515
        // We can't read more than 17 digits, so when read 17 digits, it's overflow
516
0
        val = 0;
517
0
        unsigned num_digits = 0;
518
0
        char ch = tolower(in.peek());
519
0
        while (in.good() && !strchr(kJsonDelim, ch) && num_digits != hex_num) {
520
0
            if (ch >= '0' && ch <= '9') {
521
0
                val = (val << 4) + (ch - '0');
522
0
            } else if (ch >= 'a' && ch <= 'f') {
523
0
                val = (val << 4) + (ch - 'a' + 10);
524
0
            } else {
525
                // unrecognized hex digit
526
0
                return 0;
527
0
            }
528
0
            skipChar(in);
529
0
            ch = tolower(in.peek());
530
0
            ++num_digits;
531
0
        }
532
0
        return num_digits;
533
0
    }
534
535
    // parse HEX value
536
0
    bool parseHex4(std::istream& in, unsigned& h) {
537
0
        uint64_t val;
538
0
        if (4 == parseHexHelper(in, val, 4)) {
539
0
            h = (unsigned)val;
540
0
            return true;
541
0
        }
542
0
        return false;
543
0
    }
544
545
    /*
546
     parse Escape char.
547
  */
548
0
    bool parseEscape(std::istream& in, char* out, int& len) {
549
        /*
550
      This is extracted from cJSON implementation.
551
      This is about the mask of the first byte in UTF-8.
552
      The mask is defined in:
553
      http://en.wikipedia.org/wiki/UTF-8#Description
554
    */
555
0
        const unsigned char firstByteMark[6] = {0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};
556
0
        if (!in.good()) {
557
0
            return false;
558
0
        }
559
0
        char c = nextChar(in);
560
0
        len = 1;
561
0
        switch (c) {
562
        // \" \\ \/  \b \f \n \r \t
563
0
        case '"':
564
0
            *out = '"';
565
0
            return true;
566
0
        case '\\':
567
0
            *out = '\\';
568
0
            return true;
569
0
        case '/':
570
0
            *out = '/';
571
0
            return true;
572
0
        case 'b':
573
0
            *out = '\b';
574
0
            return true;
575
0
        case 'f':
576
0
            *out = '\f';
577
0
            return true;
578
0
        case 'n':
579
0
            *out = '\n';
580
0
            return true;
581
0
        case 'r':
582
0
            *out = '\r';
583
0
            return true;
584
0
        case 't':
585
0
            *out = '\t';
586
0
            return true;
587
0
        case 'u': {
588
0
            unsigned uc;
589
0
            if (!parseHex4(in, uc)) {
590
0
                return false;
591
0
            }
592
            /*
593
          For DC00 to DFFF, it should be low surrogates for UTF16.
594
          So if it display in the high bits, it's invalid.
595
        */
596
0
            if (uc >= 0xDC00 && uc <= 0xDFFF) {
597
0
                return false;
598
0
            }
599
600
            /*
601
          For D800 to DBFF, it's the high surrogates for UTF16.
602
          So it's utf-16, there must be another one between 0xDC00
603
          and 0xDFFF.
604
        */
605
0
            if (uc >= 0xD800 && uc <= 0xDBFF) {
606
0
                unsigned uc2;
607
608
0
                if (!in.good()) {
609
0
                    return false;
610
0
                }
611
0
                c = nextChar(in);
612
0
                if (c != '\\') {
613
0
                    return false;
614
0
                }
615
616
0
                if (!in.good()) {
617
0
                    return false;
618
0
                }
619
0
                c = nextChar(in);
620
0
                if (c != 'u') {
621
0
                    return false;
622
0
                }
623
624
0
                if (!parseHex4(in, uc2)) {
625
0
                    return false;
626
0
                }
627
                /*
628
            Now we need the low surrogates for UTF16. It should be
629
            within 0xDC00 and 0xDFFF.
630
          */
631
0
                if (uc2 < 0xDC00 || uc2 > 0xDFFF) return false;
632
                /*
633
            For the character that not in the Basic Multilingual Plan,
634
            it's represented as twelve-character, encoding the UTF-16
635
            surrogate pair.
636
            UTF16 is between 0x10000 and 0x10FFFF. The high surrogate
637
            present the high bits and the low surrogate present the
638
            lower 10 bits.
639
            For detailed explanation, please refer to:
640
            http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-404.pdf
641
            Then it will be converted to UTF8.
642
          */
643
0
                uc = 0x10000 + (((uc & 0x3FF) << 10) | (uc2 & 0x3FF));
644
0
            }
645
646
            /*
647
          Get the length of the unicode.
648
          Please refer to http://en.wikipedia.org/wiki/UTF-8#Description.
649
        */
650
0
            if (uc < 0x80)
651
0
                len = 1;
652
0
            else if (uc < 0x800)
653
0
                len = 2;
654
0
            else if (uc < 0x10000)
655
0
                len = 3;
656
0
            else
657
0
                len = 4;
658
0
            out += len;
659
            /*
660
          Encode it.
661
          Please refer to http://en.wikipedia.org/wiki/UTF-8#Description.
662
          This part of code has a reference to cJSON.
663
        */
664
0
            switch (len) {
665
0
            case 4:
666
0
                *--out = ((uc | 0x80) & 0xBF);
667
0
                uc >>= 6;
668
0
                [[fallthrough]];
669
0
            case 3:
670
0
                *--out = ((uc | 0x80) & 0xBF);
671
0
                uc >>= 6;
672
0
                [[fallthrough]];
673
0
            case 2:
674
0
                *--out = ((uc | 0x80) & 0xBF);
675
0
                uc >>= 6;
676
0
                [[fallthrough]];
677
0
            case 1:
678
                // Mask the first byte according to the standard.
679
0
                *--out = (uc | firstByteMark[len - 1]);
680
0
            }
681
0
            return true;
682
0
            break;
683
0
        }
684
0
        default:
685
0
            return false;
686
0
            break;
687
0
        }
688
0
    }
689
690
    // parse a string
691
533
    bool parseString(std::istream& in) {
692
533
        const int BUFFER_LEN = 4096;
693
533
        if (!writer_.writeStartString()) {
694
0
            err_ = JsonbErrType::E_OUTPUT_FAIL;
695
0
            return false;
696
0
        }
697
698
        // write 4KB at a time
699
533
        char buffer[BUFFER_LEN];
700
533
        int nread = 0;
701
2.01k
        while (in.good()) {
702
2.01k
            char ch = nextChar(in);
703
2.01k
            if (ch == '"') {
704
                // write all remaining bytes in the buffer
705
533
                if (nread > 0) {
706
533
                    if (!writer_.writeString(buffer, nread)) {
707
0
                        err_ = JsonbErrType::E_OUTPUT_FAIL;
708
0
                        return false;
709
0
                    }
710
533
                }
711
                // end writing string
712
533
                if (!writer_.writeEndString()) {
713
0
                    err_ = JsonbErrType::E_OUTPUT_FAIL;
714
0
                    return false;
715
0
                }
716
533
                return true;
717
1.47k
            } else if (ch == '\\') {
718
                // this is a escape char
719
0
                char escape_buffer[5]; // buffer for escape
720
0
                int len;
721
0
                if (!parseEscape(in, escape_buffer, len)) {
722
0
                    err_ = JsonbErrType::E_INVALID_STR;
723
0
                    return false;
724
0
                }
725
726
                // Write each char to the buffer
727
0
                for (int i = 0; i != len; ++i) {
728
0
                    buffer[nread++] = escape_buffer[i];
729
0
                    if (nread == BUFFER_LEN) {
730
0
                        if (!writer_.writeString(buffer, nread)) {
731
0
                            err_ = JsonbErrType::E_OUTPUT_FAIL;
732
0
                            return false;
733
0
                        }
734
0
                        nread = 0;
735
0
                    }
736
0
                }
737
1.47k
            } else {
738
                // just a char
739
1.47k
                buffer[nread++] = ch;
740
1.47k
                if (nread == BUFFER_LEN) {
741
                    // flush buffer
742
0
                    if (!writer_.writeString(buffer, nread)) {
743
0
                        err_ = JsonbErrType::E_OUTPUT_FAIL;
744
0
                        return false;
745
0
                    }
746
0
                    nread = 0;
747
0
                }
748
1.47k
            }
749
2.01k
        }
750
751
0
        err_ = JsonbErrType::E_INVALID_STR;
752
0
        return false;
753
533
    }
754
755
    // parse a number
756
    // Number format can be hex, octal, or decimal (including float).
757
    // Only decimal can have (+/-) sign prefix.
758
1.06k
    bool parseNumber(std::istream& in) {
759
1.06k
        bool ret = false;
760
1.06k
        switch (in.peek()) {
761
0
        case '0': {
762
0
            skipChar(in);
763
764
0
            if (in.peek() == 'x' || in.peek() == 'X') {
765
0
                skipChar(in);
766
0
                ret = parseHex(in);
767
0
            } else if (in.peek() == '.') {
768
0
                skipChar(in); // remove '.'
769
0
                num_buf_[0] = '.';
770
0
                ret = parseDouble(in, num_buf_ + 1);
771
0
            } else {
772
0
                ret = parseOctal(in);
773
0
            }
774
775
0
            break;
776
0
        }
777
0
        case '-': {
778
0
            skipChar(in);
779
0
            ret = parseDecimal(in, true);
780
0
            break;
781
0
        }
782
0
        case '+':
783
0
            skipChar(in);
784
        // fall through
785
1.06k
        default:
786
1.06k
            ret = parseDecimal(in);
787
1.06k
            break;
788
1.06k
        }
789
790
1.06k
        return ret;
791
1.06k
    }
792
793
    // parse a number in hex format
794
0
    bool parseHex(std::istream& in) {
795
0
        uint64_t val = 0;
796
0
        int num_digits;
797
0
        if (0 == (num_digits = parseHexHelper(in, val))) {
798
0
            err_ = JsonbErrType::E_INVALID_HEX;
799
0
            return false;
800
0
        }
801
802
0
        int size = 0;
803
0
        if (num_digits <= 2) {
804
0
            size = writer_.writeInt8((int8_t)val);
805
0
        } else if (num_digits <= 4) {
806
0
            size = writer_.writeInt16((int16_t)val);
807
0
        } else if (num_digits <= 8) {
808
0
            size = writer_.writeInt32((int32_t)val);
809
0
        } else if (num_digits <= 16) {
810
0
            size = writer_.writeInt64(val);
811
0
        } else {
812
0
            err_ = JsonbErrType::E_HEX_OVERFLOW;
813
0
            return false;
814
0
        }
815
816
0
        if (size == 0) {
817
0
            err_ = JsonbErrType::E_OUTPUT_FAIL;
818
0
            return false;
819
0
        }
820
821
0
        return true;
822
0
    }
823
824
    // parse a number in octal format
825
0
    bool parseOctal(std::istream& in) {
826
0
        int64_t val = 0;
827
0
        char ch = in.peek();
828
0
        while (in.good() && !strchr(kJsonDelim, ch)) {
829
0
            if (ch >= '0' && ch <= '7') {
830
0
                val = val * 8 + (ch - '0');
831
0
            } else {
832
0
                err_ = JsonbErrType::E_INVALID_OCTAL;
833
0
                return false;
834
0
            }
835
836
            // check if the number overflows
837
0
            if (val < 0) {
838
0
                err_ = JsonbErrType::E_OCTAL_OVERFLOW;
839
0
                return false;
840
0
            }
841
842
0
            skipChar(in);
843
0
            ch = in.peek();
844
0
        }
845
846
0
        int size = 0;
847
0
        if (val <= std::numeric_limits<int8_t>::max()) {
848
0
            size = writer_.writeInt8((int8_t)val);
849
0
        } else if (val <= std::numeric_limits<int16_t>::max()) {
850
0
            size = writer_.writeInt16((int16_t)val);
851
0
        } else if (val <= std::numeric_limits<int32_t>::max()) {
852
0
            size = writer_.writeInt32((int32_t)val);
853
0
        } else { // val <= INT64_MAX
854
0
            size = writer_.writeInt64(val);
855
0
        }
856
857
0
        if (size == 0) {
858
0
            err_ = JsonbErrType::E_OUTPUT_FAIL;
859
0
            return false;
860
0
        }
861
862
0
        return true;
863
0
    }
864
865
    // parse a number in decimal (including float)
866
1.06k
    bool parseDecimal(std::istream& in, bool neg = false) {
867
1.06k
        char ch = 0;
868
1.06k
        while (in.good() && (ch = in.peek()) == '0') skipChar(in);
869
870
1.06k
        char* pbuf = num_buf_;
871
1.06k
        if (neg) *(pbuf++) = '-';
872
873
1.06k
        char* save_pos = pbuf;
874
4.69k
        while (in.good() && !strchr(kJsonDelim, ch)) {
875
3.99k
            *(pbuf++) = ch;
876
3.99k
            if (pbuf == end_buf_) {
877
0
                err_ = JsonbErrType::E_DECIMAL_OVERFLOW;
878
0
                return false;
879
0
            }
880
881
3.99k
            if (ch == '.') {
882
261
                skipChar(in); // remove '.'
883
261
                return parseDouble(in, pbuf);
884
3.72k
            } else if (ch == 'E' || ch == 'e') {
885
0
                skipChar(in); // remove 'E'
886
0
                return parseExponent(in, pbuf);
887
3.72k
            } else if (ch < '0' || ch > '9') {
888
92
                err_ = JsonbErrType::E_INVALID_DECIMAL;
889
92
                return false;
890
92
            }
891
892
3.63k
            skipChar(in);
893
3.63k
            ch = in.peek();
894
3.63k
        }
895
707
        if (save_pos == pbuf) {
896
0
            err_ = JsonbErrType::E_INVALID_DECIMAL; // empty input
897
0
            return false;
898
0
        }
899
900
707
        *pbuf = 0; // set null-terminator
901
707
        StringParser::ParseResult parse_result = StringParser::PARSE_SUCCESS;
902
707
        int128_t val =
903
707
                StringParser::string_to_int<int128_t>(num_buf_, pbuf - num_buf_, &parse_result);
904
707
        if (parse_result != StringParser::PARSE_SUCCESS) {
905
0
            VLOG_ROW << "debug string_to_int error for " << num_buf_ << " val=" << val
906
0
                     << " parse_result=" << parse_result;
907
0
            err_ = JsonbErrType::E_DECIMAL_OVERFLOW;
908
0
            return false;
909
0
        }
910
911
707
        int size = 0;
912
707
        if (val >= std::numeric_limits<int8_t>::min() &&
913
707
            val <= std::numeric_limits<int8_t>::max()) {
914
329
            size = writer_.writeInt8((int8_t)val);
915
378
        } else if (val >= std::numeric_limits<int16_t>::min() &&
916
378
                   val <= std::numeric_limits<int16_t>::max()) {
917
266
            size = writer_.writeInt16((int16_t)val);
918
266
        } else if (val >= std::numeric_limits<int32_t>::min() &&
919
112
                   val <= std::numeric_limits<int32_t>::max()) {
920
56
            size = writer_.writeInt32((int32_t)val);
921
56
        } else if (val >= std::numeric_limits<int64_t>::min() &&
922
56
                   val <= std::numeric_limits<int64_t>::max()) {
923
56
            size = writer_.writeInt64((int64_t)val);
924
56
        } else { // INT128
925
0
            size = writer_.writeInt128(val);
926
0
        }
927
928
707
        if (size == 0) {
929
0
            err_ = JsonbErrType::E_OUTPUT_FAIL;
930
0
            return false;
931
0
        }
932
933
707
        return true;
934
707
    }
935
936
    // parse IEEE 754 double precision
937
261
    bool parseDouble(std::istream& in, char* pbuf) {
938
261
        char* save_pos = pbuf;
939
261
        char ch = in.peek();
940
761
        while (in.good() && !strchr(kJsonDelim, ch)) {
941
511
            *(pbuf++) = ch;
942
511
            if (pbuf == end_buf_) {
943
0
                err_ = JsonbErrType::E_DOUBLE_OVERFLOW;
944
0
                return false;
945
0
            }
946
947
511
            if (ch == 'e' || ch == 'E') {
948
0
                skipChar(in); // remove 'E'
949
0
                return parseExponent(in, pbuf);
950
511
            } else if (ch < '0' || ch > '9') {
951
11
                err_ = JsonbErrType::E_INVALID_DECIMAL;
952
11
                return false;
953
11
            }
954
955
500
            skipChar(in);
956
500
            ch = in.peek();
957
500
        }
958
250
        if (save_pos == pbuf) {
959
0
            err_ = JsonbErrType::E_INVALID_DECIMAL; // empty input
960
0
            return false;
961
0
        }
962
963
250
        *pbuf = 0; // set null-terminator
964
250
        return internConvertBufferToDouble(num_buf_, pbuf - num_buf_);
965
250
    }
966
967
    // parse the exponent part of a double number
968
0
    bool parseExponent(std::istream& in, char* pbuf) {
969
0
        char ch = in.peek();
970
0
        if (in.good()) {
971
0
            if (ch == '+' || ch == '-') {
972
0
                *(pbuf++) = ch;
973
0
                if (pbuf == end_buf_) {
974
0
                    err_ = JsonbErrType::E_DOUBLE_OVERFLOW;
975
0
                    return false;
976
0
                }
977
0
                skipChar(in);
978
0
                ch = in.peek();
979
0
            }
980
0
        }
981
982
0
        char* save_pos = pbuf;
983
0
        while (in.good() && !strchr(kJsonDelim, ch)) {
984
0
            *(pbuf++) = ch;
985
0
            if (pbuf == end_buf_) {
986
0
                err_ = JsonbErrType::E_DOUBLE_OVERFLOW;
987
0
                return false;
988
0
            }
989
990
0
            if (ch < '0' || ch > '9') {
991
0
                err_ = JsonbErrType::E_INVALID_EXPONENT;
992
0
                return false;
993
0
            }
994
995
0
            skipChar(in);
996
0
            ch = in.peek();
997
0
        }
998
0
        if (save_pos == pbuf) {
999
0
            err_ = JsonbErrType::E_INVALID_EXPONENT; // empty input
1000
0
            return false;
1001
0
        }
1002
1003
0
        *pbuf = 0; // set null-terminator
1004
0
        return internConvertBufferToDouble(num_buf_, pbuf - num_buf_);
1005
0
    }
1006
1007
    // convert the buffered number string to a double and write it out
1008
250
    bool internConvertBufferToDouble(char* num_buf_, int len) {
1009
250
        StringParser::ParseResult parse_result = StringParser::PARSE_SUCCESS;
1010
250
        double val = StringParser::string_to_float<double>(num_buf_, len, &parse_result);
1011
250
        if (parse_result != StringParser::PARSE_SUCCESS) {
1012
0
            VLOG_ROW << "debug string_to_float error for " << num_buf_ << " val=" << val
1013
0
                     << " parse_result=" << parse_result;
1014
0
            err_ = JsonbErrType::E_DECIMAL_OVERFLOW;
1015
0
            return false;
1016
0
        }
1017
1018
250
        if (writer_.writeDouble(val) == 0) {
1019
0
            err_ = JsonbErrType::E_OUTPUT_FAIL;
1020
0
            return false;
1021
0
        }
1022
1023
250
        return true;
1024
250
    }
1025
1026
5.94k
    void trim(std::istream& in) {
1027
7.15k
        while (in.good() && strchr(kWhiteSpace, in.peek())) {
1028
1.20k
            skipChar(in);
1029
1.20k
        }
1030
5.94k
    }
1031
1032
    /*
1033
   * Helper functions to keep track of characters read.
1034
   * Do not rely on std::istream's tellg() which may not be implemented.
1035
   */
1036
1037
6.29k
    char nextChar(std::istream& in) {
1038
6.29k
        ++stream_pos_;
1039
6.29k
        return in.get();
1040
6.29k
    }
1041
1042
7.61k
    void skipChar(std::istream& in) {
1043
7.61k
        ++stream_pos_;
1044
7.61k
        in.ignore();
1045
7.61k
    }
1046
1047
private:
1048
    JsonbWriterT<OS_TYPE> writer_;
1049
    uint32_t stream_pos_;
1050
    JsonbErrType err_;
1051
    char num_buf_[512]; // buffer to hold number string
1052
    const char* end_buf_ = num_buf_ + sizeof(num_buf_) - 1;
1053
    uint32_t nesting_lvl_ = 0;
1054
};
1055
1056
// Default parser type: JsonbParserT instantiated with JsonbOutStream.
using JsonbParser = JsonbParserT<JsonbOutStream>;
1057
1058
} // namespace doris
1059
1060
#endif // JSONB_JSONBJSONPARSER_H