Coverage Report

Created: 2025-03-10 19:30

/root/doris/be/src/util/jsonb_parser.h
Line
Count
Source (jump to first uncovered line)
1
/*
2
 *  Copyright (c) 2014, Facebook, Inc.
3
 *  All rights reserved.
4
 *
5
 *  This source code is licensed under the BSD-style license found in the
6
 *  LICENSE file in the root directory of this source tree. An additional grant
7
 *  of patent rights can be found in the PATENTS file in the same directory.
8
 *
9
 */
10
11
/*
12
 * This file defines JsonbParserT (template) and JsonbParser.
13
 *
14
 * JsonbParserT is a template class which implements a JSON parser.
15
 * JsonbParserT parses JSON text, and serializes it to JSONB binary format
16
 * by using JsonbWriterT object. By default, JsonbParserT creates a new
17
 * JsonbWriterT object with an output stream object.  However, you can also
18
 * pass in your JsonbWriterT or any stream object that implements some basic
19
 * interface of std::ostream (see JsonbStream.h).
20
 *
21
 * JsonbParser specializes JsonbParserT with JsonbOutStream type (see
22
 * JsonbStream.h). So unless you want to provide your own output stream
23
 * type, use JsonbParser object.
24
 *
25
 * ** Parsing JSON **
26
 * JsonbParserT parses JSON string, and directly serializes into JSONB
27
 * packed bytes. There are three ways to parse a JSON string: (1) using
28
 * c-string, (2) using string with len, (3) using std::istream object. You can
29
 * use custom streambuf to redirect output. JsonbOutBuffer is a streambuf used
30
 * internally if the input is raw character buffer.
31
 *
32
 * You can reuse a JsonbParserT object to parse/serialize multiple JSON
33
 * strings, and the previous JSONB will be overwritten.
34
 *
35
 * If parsing fails (returned false), the error code will be set to one of
36
 * JsonbErrType, and can be retrieved by calling getErrorCode().
37
 *
38
 * ** External dictionary **
39
 * During parsing a JSON string, you can pass a call-back function to map a key
40
 * string to an id, and store the dictionary id in JSONB to save space. The
41
 * purpose of using an external dictionary is more towards a collection of
42
 * documents (which has common keys) rather than a single document, so that
43
 * space saving will be significant.
44
 *
45
 * ** Endianness **
46
 * Note: JSONB serialization doesn't assume endianness of the server. However
47
 * you will need to ensure that the endianness at the reader side is the same
48
 * as that at the writer side (if they are on different machines). Otherwise,
49
 * proper conversion is needed when a number value is returned to the
50
 * caller/writer.
51
 *
52
 * @author Tian Xia <tianx@fb.com>
53
 * 
54
 * this file is copied from 
55
 * https://github.com/facebook/mysql-5.6/blob/fb-mysql-5.6.35/fbson/FbsonJsonParser.h
56
 * and modified by Doris
57
 */
58
59
#ifndef JSONB_JSONBJSONPARSER_H
60
#define JSONB_JSONBJSONPARSER_H
61
62
#include <cmath>
63
#include <limits>
64
65
#include "jsonb_document.h"
66
#include "jsonb_error.h"
67
#include "jsonb_writer.h"
68
#include "string_parser.hpp"
69
70
namespace doris {
71
72
const char* const kJsonDelim = " ,]}\t\r\n";
73
const char* const kWhiteSpace = " \t\n\r";
74
75
/*
76
 * Template JsonbParserT
77
 */
78
template <class OS_TYPE>
79
class JsonbParserT {
80
public:
81
897
    // Builds a parser whose writer is default-initialized; the stream position
    // starts at zero and no error is pending.
    JsonbParserT()
            : stream_pos_(0),
              err_(JsonbErrType::E_NONE) {
    }
82
83
    // Builds a parser whose writer is constructed from the caller-supplied
    // output stream `os`; the stream position starts at zero and no error is pending.
    explicit JsonbParserT(OS_TYPE& os)
            : writer_(os),
              stream_pos_(0),
              err_(JsonbErrType::E_NONE) {
    }
84
85
    // parse a UTF-8 JSON string
86
    bool parse(const std::string& str, hDictInsert handler = nullptr) {
87
        return parse(str.c_str(), (unsigned int)str.size(), handler);
88
    }
89
90
    // parse a UTF-8 JSON c-style string (NULL terminated)
91
    bool parse(const char* c_str, hDictInsert handler = nullptr) {
92
        return parse(c_str, (unsigned int)strlen(c_str), handler);
93
    }
94
95
    // parse a UTF-8 JSON string with length
96
1.09k
    bool parse(const char* pch, unsigned int len, hDictInsert handler = nullptr) {
97
1.09k
        if (!pch || len == 0) {
98
0
            err_ = JsonbErrType::E_EMPTY_DOCUMENT;
99
0
            return false;
100
0
        }
101
102
1.09k
        JsonbInBuffer sb(pch, len);
103
1.09k
        std::istream in(&sb);
104
1.09k
        return parse(in, handler);
105
1.09k
    }
106
107
    // parse UTF-8 JSON text from an input stream
108
1.09k
    bool parse(std::istream& in, hDictInsert handler = nullptr) {
109
1.09k
        bool res = false;
110
1.09k
        err_ = JsonbErrType::E_NONE;
111
1.09k
        stream_pos_ = 0;
112
113
        // reset output stream
114
1.09k
        writer_.reset();
115
116
1.09k
        trim(in);
117
118
        // TODO(wzy): parsePrimitive should be implemented
119
1.09k
        if (in.peek() == '{') {
120
142
            skipChar(in);
121
142
            res = parseObject(in, handler);
122
957
        } else if (in.peek() == '[') {
123
391
            skipChar(in);
124
391
            res = parseArray(in, handler);
125
566
        } else {
126
566
            res = parsePrimitive(in, handler);
127
566
            if (!res) err_ = handle_parse_failure(in);
128
566
        }
129
130
1.09k
        trim(in);
131
1.09k
        if (res && !in.eof()) {
132
0
            err_ = JsonbErrType::E_INVALID_DOCU;
133
0
            return false;
134
0
        }
135
136
1.09k
        return res;
137
1.09k
    }
138
139
2.05k
    // Returns the writer this parser serializes into.
    JsonbWriterT<OS_TYPE>& getWriter() {
        return writer_;
    }
140
141
70
    // Returns the current error code (E_NONE when no error is recorded).
    JsonbErrType getErrorCode() {
        return err_;
    }
142
143
    // Bundles the current error's message and stream position into a JsonbErrInfo.
    JsonbErrInfo getErrorInfo() {
        assert(err_ < JsonbErrType::E_NUM_ERRORS);

        JsonbErrInfo info;
        info.err_msg = JsonbErrMsg::getErrMsg(err_);
        // stream_pos_ always points to the next char, so err_pos is 1-based
        info.err_pos = stream_pos_;
        return info;
    }
154
155
    // clear error code
156
    // Resets the stored error code to E_NONE.
    void clearErr() {
        err_ = JsonbErrType::E_NONE;
    }
157
158
private:
159
0
    // Picks the error code for a top-level literal that was re-parsed after a failure.
    // If the literal parsed (`parse_res`) and, after skipping whitespace, the
    // stream is no longer good, the result is E_INVALID_DOCU_COMPAT.
    // In every other case it is E_INVALID_DOCU.
    JsonbErrType handle_parse_value_failure(bool parse_res, std::istream& in) {
        if (!parse_res) {
            return JsonbErrType::E_INVALID_DOCU;
        }

        trim(in);
        return in.good() ? JsonbErrType::E_INVALID_DOCU : JsonbErrType::E_INVALID_DOCU_COMPAT;
    }
169
170
    // In case json is determined to be invalid at top level,
171
    // try to parse literal values.
172
    // We return a different error code E_INVALID_DOCU_COMPAT
173
    // in case the input json contains these values.
174
    // Returning a different error code will cause an
175
    // auditing on the caller.
176
    // This is mainly done because 8.0 JSON_VALID considers
177
    // this as a valid input.
178
48
    // Retries the remaining input as a bare literal, wrapped in an array on the
    // writer, and returns the error code to report.
    // The result is E_INVALID_DOCU unless the literal parses and the input ends
    // right after it, in which case it is E_INVALID_DOCU_COMPAT.
    JsonbErrType handle_parse_failure(std::istream& in) {
        if (!writer_.writeStartArray()) {
            return JsonbErrType::E_INVALID_DOCU;
        }

        JsonbErrType result = JsonbErrType::E_INVALID_DOCU;
        const auto lead = in.peek();
        if (lead == 'n') {
            skipChar(in);
            result = handle_parse_value_failure(parseNull(in), in);
        } else if (lead == 't') {
            skipChar(in);
            result = handle_parse_value_failure(parseTrue(in), in);
        } else if (lead == 'f') {
            skipChar(in);
            result = handle_parse_value_failure(parseFalse(in), in);
        } else if (lead == '"') {
            skipChar(in);
            result = handle_parse_value_failure(parseString(in), in);
        } else if (parseNumber(in)) {
            trim(in);
            if (in.eof()) {
                result = JsonbErrType::E_INVALID_DOCU_COMPAT;
            }
        }

        // The array is closed either way; the outcome does not change the result.
        writer_.writeEndArray();
        return result;
    }
215
216
    // parse primitive
217
566
    bool parsePrimitive(std::istream& in, hDictInsert handler) {
218
566
        bool res = false;
219
566
        switch (in.peek()) {
220
56
        case 'n':
221
56
            skipChar(in);
222
56
            res = parseNull(in);
223
56
            break;
224
56
        case 't':
225
56
            skipChar(in);
226
56
            res = parseTrue(in);
227
56
            break;
228
56
        case 'f':
229
56
            skipChar(in);
230
56
            res = parseFalse(in);
231
56
            break;
232
64
        case '"':
233
64
            skipChar(in);
234
64
            res = parseString(in);
235
64
            break;
236
334
        default:
237
334
            res = parseNumber(in);
238
566
        }
239
240
566
        return res;
241
566
    }
242
243
    // parse a JSON object (comma-separated list of key-value pairs)
244
233
    bool parseObject(std::istream& in, hDictInsert handler) {
245
233
        if (!writer_.writeStartObject()) {
246
0
            err_ = JsonbErrType::E_OUTPUT_FAIL;
247
0
            return false;
248
0
        }
249
250
233
        trim(in);
251
252
233
        if (in.peek() == '}') {
253
59
            skipChar(in);
254
            // empty object
255
59
            if (!writer_.writeEndObject()) {
256
0
                err_ = JsonbErrType::E_OUTPUT_FAIL;
257
0
                return false;
258
0
            }
259
59
            return true;
260
59
        }
261
262
319
        while (in.good()) {
263
319
            if (nextChar(in) != '"') {
264
11
                err_ = JsonbErrType::E_INVALID_OBJ;
265
11
                return false;
266
11
            }
267
268
308
            if (!parseKVPair(in, handler)) {
269
0
                return false;
270
0
            }
271
272
308
            trim(in);
273
274
308
            char ch = nextChar(in);
275
308
            if (ch == '}') {
276
                // end of the object
277
163
                if (!writer_.writeEndObject()) {
278
0
                    err_ = JsonbErrType::E_OUTPUT_FAIL;
279
0
                    return false;
280
0
                }
281
163
                return true;
282
163
            } else if (ch != ',') {
283
0
                err_ = JsonbErrType::E_INVALID_OBJ;
284
0
                return false;
285
0
            }
286
287
145
            trim(in);
288
145
        }
289
290
0
        err_ = JsonbErrType::E_INVALID_OBJ;
291
0
        return false;
292
174
    }
293
294
    // parse a JSON array (comma-separated list of values)
295
393
    bool parseArray(std::istream& in, hDictInsert handler) {
296
393
        if (!writer_.writeStartArray()) {
297
0
            err_ = JsonbErrType::E_OUTPUT_FAIL;
298
0
            return false;
299
0
        }
300
301
393
        trim(in);
302
303
393
        if (in.peek() == ']') {
304
56
            skipChar(in);
305
            // empty array
306
56
            if (!writer_.writeEndArray()) {
307
0
                err_ = JsonbErrType::E_OUTPUT_FAIL;
308
0
                return false;
309
0
            }
310
56
            return true;
311
56
        }
312
313
1.26k
        while (in.good()) {
314
1.26k
            if (!parseValue(in, handler)) {
315
11
                return false;
316
11
            }
317
318
1.25k
            trim(in);
319
320
1.25k
            char ch = nextChar(in);
321
1.25k
            if (ch == ']') {
322
                // end of the array
323
326
                if (!writer_.writeEndArray()) {
324
0
                    err_ = JsonbErrType::E_OUTPUT_FAIL;
325
0
                    return false;
326
0
                }
327
326
                return true;
328
925
            } else if (ch != ',') {
329
0
                err_ = JsonbErrType::E_INVALID_ARR;
330
0
                return false;
331
0
            }
332
333
925
            trim(in);
334
925
        }
335
336
0
        err_ = JsonbErrType::E_INVALID_ARR;
337
0
        return false;
338
337
    }
339
340
    // parse a key-value pair, separated by ":"
341
308
    bool parseKVPair(std::istream& in, hDictInsert handler) {
342
308
        if (parseKey(in, handler) && parseValue(in, handler)) {
343
308
            return true;
344
308
        }
345
346
0
        return false;
347
308
    }
348
349
    // parse a key (must be string)
350
308
    bool parseKey(std::istream& in, hDictInsert handler) {
351
308
        char key[JsonbKeyValue::sMaxKeyLen];
352
308
        int key_len = 0;
353
964
        while (in.good() && in.peek() != '"' && key_len < JsonbKeyValue::sMaxKeyLen) {
354
656
            char ch = nextChar(in);
355
656
            if (ch == '\\') {
356
0
                char escape_buffer[5]; // buffer for escape
357
0
                int len;
358
0
                if (!parseEscape(in, escape_buffer, len)) {
359
0
                    err_ = JsonbErrType::E_INVALID_KEY_STRING;
360
0
                    return false;
361
0
                }
362
0
                if (key_len + len >= JsonbKeyValue::sMaxKeyLen) {
363
0
                    err_ = JsonbErrType::E_INVALID_KEY_LENGTH;
364
0
                    return false;
365
0
                }
366
0
                memcpy(key + key_len, escape_buffer, len);
367
0
                key_len += len;
368
656
            } else {
369
656
                key[key_len++] = ch;
370
656
            }
371
656
        }
372
373
308
        if (!in.good() || in.peek() != '"' || key_len == 0) {
374
0
            if (key_len == JsonbKeyValue::sMaxKeyLen)
375
0
                err_ = JsonbErrType::E_INVALID_KEY_LENGTH;
376
0
            else
377
0
                err_ = JsonbErrType::E_INVALID_KEY_STRING;
378
0
            return false;
379
0
        }
380
381
308
        skipChar(in); // discard '"'
382
383
308
        int key_id = -1;
384
308
        if (handler) {
385
0
            key_id = handler(key, key_len);
386
0
        }
387
388
308
        if (key_id < 0) {
389
308
            writer_.writeKey(key, key_len);
390
308
        } else {
391
0
            writer_.writeKey(key_id);
392
0
        }
393
394
308
        trim(in);
395
396
308
        if (nextChar(in) != ':') {
397
0
            err_ = JsonbErrType::E_INVALID_OBJ;
398
0
            return false;
399
0
        }
400
401
308
        trim(in);
402
308
        if (!in.good()) {
403
0
            err_ = JsonbErrType::E_INVALID_OBJ;
404
0
            return false;
405
0
        }
406
407
308
        return true;
408
308
    }
409
410
    // parse a value
411
1.57k
    bool parseValue(std::istream& in, hDictInsert handler) {
412
1.57k
        bool res = false;
413
414
1.57k
        switch (in.peek()) {
415
0
        case 'N':
416
102
        case 'n': {
417
102
            skipChar(in);
418
102
            res = parseNull(in);
419
102
            break;
420
0
        }
421
0
        case 'T':
422
106
        case 't': {
423
106
            skipChar(in);
424
106
            res = parseTrue(in);
425
106
            break;
426
0
        }
427
0
        case 'F':
428
102
        case 'f': {
429
102
            skipChar(in);
430
102
            res = parseFalse(in);
431
102
            break;
432
0
        }
433
475
        case '"': {
434
475
            skipChar(in);
435
475
            res = parseString(in);
436
475
            break;
437
0
        }
438
91
        case '{': {
439
91
            skipChar(in);
440
91
            ++nesting_lvl_;
441
91
            if (nesting_lvl_ >= MaxNestingLevel) {
442
0
                err_ = JsonbErrType::E_NESTING_LVL_OVERFLOW;
443
0
                return false;
444
0
            }
445
91
            res = parseObject(in, handler);
446
91
            if (res) {
447
91
                --nesting_lvl_;
448
91
            }
449
91
            break;
450
91
        }
451
2
        case '[': {
452
2
            skipChar(in);
453
2
            ++nesting_lvl_;
454
2
            if (nesting_lvl_ >= MaxNestingLevel) {
455
0
                err_ = JsonbErrType::E_NESTING_LVL_OVERFLOW;
456
0
                return false;
457
0
            }
458
2
            res = parseArray(in, handler);
459
2
            if (res) {
460
2
                --nesting_lvl_;
461
2
            }
462
2
            break;
463
2
        }
464
692
        default: {
465
692
            res = parseNumber(in);
466
692
            break;
467
2
        }
468
1.57k
        }
469
470
1.57k
        return res;
471
1.57k
    }
472
473
    // parse NULL value
474
158
    bool parseNull(std::istream& in) {
475
158
        if (tolower(nextChar(in)) == 'u' && tolower(nextChar(in)) == 'l' &&
476
158
            tolower(nextChar(in)) == 'l') {
477
158
            writer_.writeNull();
478
158
            return true;
479
158
        }
480
481
0
        err_ = JsonbErrType::E_INVALID_SCALAR_VALUE;
482
0
        return false;
483
158
    }
484
485
    // parse TRUE value
486
162
    bool parseTrue(std::istream& in) {
487
162
        if (tolower(nextChar(in)) == 'r' && tolower(nextChar(in)) == 'u' &&
488
162
            tolower(nextChar(in)) == 'e') {
489
162
            writer_.writeBool(true);
490
162
            return true;
491
162
        }
492
493
0
        err_ = JsonbErrType::E_INVALID_SCALAR_VALUE;
494
0
        return false;
495
162
    }
496
497
    // parse FALSE value
498
158
    bool parseFalse(std::istream& in) {
499
158
        if (tolower(nextChar(in)) == 'a' && tolower(nextChar(in)) == 'l' &&
500
158
            tolower(nextChar(in)) == 's' && tolower(nextChar(in)) == 'e') {
501
158
            writer_.writeBool(false);
502
158
            return true;
503
158
        }
504
505
0
        err_ = JsonbErrType::E_INVALID_SCALAR_VALUE;
506
0
        return false;
507
158
    }
508
509
    /*
510
    This is a helper function to parse the hex value. hex_num means the
511
    number of digits needed to be parsed. If less than zero, then it will
512
    consider all the characters between current and any character in JsonDelim.
513
  */
514
0
    unsigned parseHexHelper(std::istream& in, uint64_t& val, unsigned hex_num = 17) {
515
        // We can't read more than 17 digits, so when read 17 digits, it's overflow
516
0
        val = 0;
517
0
        unsigned num_digits = 0;
518
0
        char ch = tolower(in.peek());
519
0
        while (in.good() && !strchr(kJsonDelim, ch) && num_digits != hex_num) {
520
0
            if (ch >= '0' && ch <= '9') {
521
0
                val = (val << 4) + (ch - '0');
522
0
            } else if (ch >= 'a' && ch <= 'f') {
523
0
                val = (val << 4) + (ch - 'a' + 10);
524
0
            } else {
525
                // unrecognized hex digit
526
0
                return 0;
527
0
            }
528
0
            skipChar(in);
529
0
            ch = tolower(in.peek());
530
0
            ++num_digits;
531
0
        }
532
0
        return num_digits;
533
0
    }
534
535
    // parse HEX value
536
0
    bool parseHex4(std::istream& in, unsigned& h) {
537
0
        uint64_t val;
538
0
        if (4 == parseHexHelper(in, val, 4)) {
539
0
            h = (unsigned)val;
540
0
            return true;
541
0
        }
542
0
        return false;
543
0
    }
544
545
    /*
546
     parse Escape char.
547
  */
548
0
    bool parseEscape(std::istream& in, char* out, int& len) {
549
        /*
550
      This is extracted from cJSON implementation.
551
      This is about the mask of the first byte in UTF-8.
552
      The mask is defined in:
553
      http://en.wikipedia.org/wiki/UTF-8#Description
554
    */
555
0
        const unsigned char firstByteMark[6] = {0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};
556
0
        if (!in.good()) {
557
0
            return false;
558
0
        }
559
0
        char c = nextChar(in);
560
0
        len = 1;
561
0
        switch (c) {
562
        // \" \\ \/  \b \f \n \r \t
563
0
        case '"':
564
0
            *out = '"';
565
0
            return true;
566
0
        case '\\':
567
0
            *out = '\\';
568
0
            return true;
569
0
        case '/':
570
0
            *out = '/';
571
0
            return true;
572
0
        case 'b':
573
0
            *out = '\b';
574
0
            return true;
575
0
        case 'f':
576
0
            *out = '\f';
577
0
            return true;
578
0
        case 'n':
579
0
            *out = '\n';
580
0
            return true;
581
0
        case 'r':
582
0
            *out = '\r';
583
0
            return true;
584
0
        case 't':
585
0
            *out = '\t';
586
0
            return true;
587
0
        case 'u': {
588
0
            unsigned uc;
589
0
            if (!parseHex4(in, uc)) {
590
0
                return false;
591
0
            }
592
            /*
593
          For DC00 to DFFF, it should be low surrogates for UTF16.
594
          So if it display in the high bits, it's invalid.
595
        */
596
0
            if (uc >= 0xDC00 && uc <= 0xDFFF) {
597
0
                return false;
598
0
            }
599
600
            /*
601
          For D800 to DBFF, it's the high surrogates for UTF16.
602
          So it's utf-16, there must be another one between 0xDC00
603
          and 0xDFFF.
604
        */
605
0
            if (uc >= 0xD800 && uc <= 0xDBFF) {
606
0
                unsigned uc2;
607
608
0
                if (!in.good()) {
609
0
                    return false;
610
0
                }
611
0
                c = nextChar(in);
612
0
                if (c != '\\') {
613
0
                    return false;
614
0
                }
615
616
0
                if (!in.good()) {
617
0
                    return false;
618
0
                }
619
0
                c = nextChar(in);
620
0
                if (c != 'u') {
621
0
                    return false;
622
0
                }
623
624
0
                if (!parseHex4(in, uc2)) {
625
0
                    return false;
626
0
                }
627
                /*
628
            Now we need the low surrogates for UTF16. It should be
629
            within 0xDC00 and 0xDFFF.
630
          */
631
0
                if (uc2 < 0xDC00 || uc2 > 0xDFFF) return false;
632
                /*
633
            For the character that not in the Basic Multilingual Plan,
634
            it's represented as twelve-character, encoding the UTF-16
635
            surrogate pair.
636
            UTF16 is between 0x10000 and 0x10FFFF. The high surrogate
637
            present the high bits and the low surrogate present the
638
            lower 10 bits.
639
            For detailed explanation, please refer to:
640
            http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-404.pdf
641
            Then it will be converted to UTF8.
642
          */
643
0
                uc = 0x10000 + (((uc & 0x3FF) << 10) | (uc2 & 0x3FF));
644
0
            }
645
646
            /*
647
          Get the length of the unicode.
648
          Please refer to http://en.wikipedia.org/wiki/UTF-8#Description.
649
        */
650
0
            if (uc < 0x80)
651
0
                len = 1;
652
0
            else if (uc < 0x800)
653
0
                len = 2;
654
0
            else if (uc < 0x10000)
655
0
                len = 3;
656
0
            else
657
0
                len = 4;
658
0
            out += len;
659
            /*
660
          Encode it.
661
          Please refer to http://en.wikipedia.org/wiki/UTF-8#Description.
662
          This part of code has a reference to cJSON.
663
        */
664
0
            switch (len) {
665
0
            case 4:
666
0
                *--out = ((uc | 0x80) & 0xBF);
667
0
                uc >>= 6;
668
0
                [[fallthrough]];
669
0
            case 3:
670
0
                *--out = ((uc | 0x80) & 0xBF);
671
0
                uc >>= 6;
672
0
                [[fallthrough]];
673
0
            case 2:
674
0
                *--out = ((uc | 0x80) & 0xBF);
675
0
                uc >>= 6;
676
0
                [[fallthrough]];
677
0
            case 1:
678
                // Mask the first byte according to the standard.
679
0
                *--out = (uc | firstByteMark[len - 1]);
680
0
            }
681
0
            return true;
682
0
            break;
683
0
        }
684
0
        default:
685
0
            return false;
686
0
            break;
687
0
        }
688
0
    }
689
690
    // parse a string
691
539
    bool parseString(std::istream& in) {
692
539
        const int BUFFER_LEN = 4096;
693
539
        if (!writer_.writeStartString()) {
694
0
            err_ = JsonbErrType::E_OUTPUT_FAIL;
695
0
            return false;
696
0
        }
697
698
        // write 4KB at a time
699
539
        char buffer[BUFFER_LEN];
700
539
        int nread = 0;
701
2.05k
        while (in.good()) {
702
2.05k
            char ch = nextChar(in);
703
2.05k
            if (ch == '"') {
704
                // write all remaining bytes in the buffer
705
539
                if (nread > 0) {
706
539
                    if (!writer_.writeString(buffer, nread)) {
707
0
                        err_ = JsonbErrType::E_OUTPUT_FAIL;
708
0
                        return false;
709
0
                    }
710
539
                }
711
                // end writing string
712
539
                if (!writer_.writeEndString()) {
713
0
                    err_ = JsonbErrType::E_OUTPUT_FAIL;
714
0
                    return false;
715
0
                }
716
539
                return true;
717
1.51k
            } else if (ch == '\\') {
718
                // this is a escape char
719
0
                char escape_buffer[5]; // buffer for escape
720
0
                int len;
721
0
                if (!parseEscape(in, escape_buffer, len)) {
722
0
                    err_ = JsonbErrType::E_INVALID_STR;
723
0
                    return false;
724
0
                }
725
726
                // Write each char to the buffer
727
0
                for (int i = 0; i != len; ++i) {
728
0
                    buffer[nread++] = escape_buffer[i];
729
0
                    if (nread == BUFFER_LEN) {
730
0
                        if (!writer_.writeString(buffer, nread)) {
731
0
                            err_ = JsonbErrType::E_OUTPUT_FAIL;
732
0
                            return false;
733
0
                        }
734
0
                        nread = 0;
735
0
                    }
736
0
                }
737
1.51k
            } else {
738
                // just a char
739
1.51k
                buffer[nread++] = ch;
740
1.51k
                if (nread == BUFFER_LEN) {
741
                    // flush buffer
742
0
                    if (!writer_.writeString(buffer, nread)) {
743
0
                        err_ = JsonbErrType::E_OUTPUT_FAIL;
744
0
                        return false;
745
0
                    }
746
0
                    nread = 0;
747
0
                }
748
1.51k
            }
749
2.05k
        }
750
751
0
        err_ = JsonbErrType::E_INVALID_STR;
752
0
        return false;
753
539
    }
754
755
    // parse a number
756
    // Number format can be hex, octal, or decimal (including float).
757
    // Only decimal can have (+/-) sign prefix.
758
1.07k
    bool parseNumber(std::istream& in) {
759
1.07k
        bool ret = false;
760
1.07k
        switch (in.peek()) {
761
0
        case '0': {
762
0
            skipChar(in);
763
764
0
            if (in.peek() == 'x' || in.peek() == 'X') {
765
0
                skipChar(in);
766
0
                ret = parseHex(in);
767
0
            } else if (in.peek() == '.') {
768
0
                skipChar(in); // remove '.'
769
0
                num_buf_[0] = '.';
770
0
                ret = parseDouble(in, num_buf_ + 1);
771
0
            } else {
772
0
                ret = parseOctal(in);
773
0
            }
774
775
0
            break;
776
0
        }
777
0
        case '-': {
778
0
            skipChar(in);
779
0
            ret = parseDecimal(in, true);
780
0
            break;
781
0
        }
782
0
        case '+':
783
0
            skipChar(in);
784
        // fall through
785
1.07k
        default:
786
1.07k
            ret = parseDecimal(in);
787
1.07k
            break;
788
1.07k
        }
789
790
1.07k
        return ret;
791
1.07k
    }
792
793
    // parse a number in hex format
794
0
    bool parseHex(std::istream& in) {
795
0
        uint64_t val = 0;
796
0
        int num_digits;
797
0
        if (0 == (num_digits = parseHexHelper(in, val))) {
798
0
            err_ = JsonbErrType::E_INVALID_HEX;
799
0
            return false;
800
0
        }
801
802
0
        int size = 0;
803
0
        if (num_digits <= 2) {
804
0
            size = writer_.writeInt8((int8_t)val);
805
0
        } else if (num_digits <= 4) {
806
0
            size = writer_.writeInt16((int16_t)val);
807
0
        } else if (num_digits <= 8) {
808
0
            size = writer_.writeInt32((int32_t)val);
809
0
        } else if (num_digits <= 16) {
810
0
            size = writer_.writeInt64(val);
811
0
        } else {
812
0
            err_ = JsonbErrType::E_HEX_OVERFLOW;
813
0
            return false;
814
0
        }
815
816
0
        if (size == 0) {
817
0
            err_ = JsonbErrType::E_OUTPUT_FAIL;
818
0
            return false;
819
0
        }
820
821
0
        return true;
822
0
    }
823
824
    // parse a number in octal format
825
0
    bool parseOctal(std::istream& in) {
826
0
        int64_t val = 0;
827
0
        char ch = in.peek();
828
0
        while (in.good() && !strchr(kJsonDelim, ch)) {
829
0
            if (ch >= '0' && ch <= '7') {
830
0
                val = val * 8 + (ch - '0');
831
0
            } else {
832
0
                err_ = JsonbErrType::E_INVALID_OCTAL;
833
0
                return false;
834
0
            }
835
836
            // check if the number overflows
837
0
            if (val < 0) {
838
0
                err_ = JsonbErrType::E_OCTAL_OVERFLOW;
839
0
                return false;
840
0
            }
841
842
0
            skipChar(in);
843
0
            ch = in.peek();
844
0
        }
845
846
0
        int size = 0;
847
0
        if (val <= std::numeric_limits<int8_t>::max()) {
848
0
            size = writer_.writeInt8((int8_t)val);
849
0
        } else if (val <= std::numeric_limits<int16_t>::max()) {
850
0
            size = writer_.writeInt16((int16_t)val);
851
0
        } else if (val <= std::numeric_limits<int32_t>::max()) {
852
0
            size = writer_.writeInt32((int32_t)val);
853
0
        } else { // val <= INT64_MAX
854
0
            size = writer_.writeInt64(val);
855
0
        }
856
857
0
        if (size == 0) {
858
0
            err_ = JsonbErrType::E_OUTPUT_FAIL;
859
0
            return false;
860
0
        }
861
862
0
        return true;
863
0
    }
864
865
    // parse a number in decimal (including float)
866
1.07k
    bool parseDecimal(std::istream& in, bool neg = false) {
867
1.07k
        char ch = 0;
868
1.07k
        while (in.good() && (ch = in.peek()) == '0') skipChar(in);
869
870
1.07k
        char* pbuf = num_buf_;
871
1.07k
        if (neg) *(pbuf++) = '-';
872
873
1.07k
        char* save_pos = pbuf;
874
4.73k
        while (in.good() && !strchr(kJsonDelim, ch)) {
875
4.02k
            *(pbuf++) = ch;
876
4.02k
            if (pbuf == end_buf_) {
877
0
                err_ = JsonbErrType::E_DECIMAL_OVERFLOW;
878
0
                return false;
879
0
            }
880
881
4.02k
            if (ch == '.') {
882
261
                skipChar(in); // remove '.'
883
261
                return parseDouble(in, pbuf);
884
3.75k
            } else if (ch == 'E' || ch == 'e') {
885
0
                skipChar(in); // remove 'E'
886
0
                return parseExponent(in, pbuf);
887
3.75k
            } else if (ch < '0' || ch > '9') {
888
96
                err_ = JsonbErrType::E_INVALID_DECIMAL;
889
96
                return false;
890
96
            }
891
892
3.66k
            skipChar(in);
893
3.66k
            ch = in.peek();
894
3.66k
        }
895
717
        if (save_pos == pbuf) {
896
0
            err_ = JsonbErrType::E_INVALID_DECIMAL; // empty input
897
0
            return false;
898
0
        }
899
900
717
        *pbuf = 0; // set null-terminator
901
717
        StringParser::ParseResult parse_result = StringParser::PARSE_SUCCESS;
902
717
        int128_t val =
903
717
                StringParser::string_to_int<int128_t>(num_buf_, pbuf - num_buf_, &parse_result);
904
717
        if (parse_result != StringParser::PARSE_SUCCESS) {
905
0
            VLOG_ROW << "debug string_to_int error for " << num_buf_ << " val=" << val
906
0
                     << " parse_result=" << parse_result;
907
0
            err_ = JsonbErrType::E_DECIMAL_OVERFLOW;
908
0
            return false;
909
0
        }
910
911
717
        int size = 0;
912
717
        if (val >= std::numeric_limits<int8_t>::min() &&
913
717
            val <= std::numeric_limits<int8_t>::max()) {
914
335
            size = writer_.writeInt8((int8_t)val);
915
382
        } else if (val >= std::numeric_limits<int16_t>::min() &&
916
382
                   val <= std::numeric_limits<int16_t>::max()) {
917
270
            size = writer_.writeInt16((int16_t)val);
918
270
        } else if (val >= std::numeric_limits<int32_t>::min() &&
919
112
                   val <= std::numeric_limits<int32_t>::max()) {
920
56
            size = writer_.writeInt32((int32_t)val);
921
56
        } else if (val >= std::numeric_limits<int64_t>::min() &&
922
56
                   val <= std::numeric_limits<int64_t>::max()) {
923
56
            size = writer_.writeInt64((int64_t)val);
924
56
        } else { // INT128
925
0
            size = writer_.writeInt128(val);
926
0
        }
927
928
717
        if (size == 0) {
929
0
            err_ = JsonbErrType::E_OUTPUT_FAIL;
930
0
            return false;
931
0
        }
932
933
717
        return true;
934
717
    }
935
936
    // parse IEEE 754 double precision
937
261
    bool parseDouble(std::istream& in, char* pbuf) {
938
261
        char* save_pos = pbuf;
939
261
        char ch = in.peek();
940
761
        while (in.good() && !strchr(kJsonDelim, ch)) {
941
511
            *(pbuf++) = ch;
942
511
            if (pbuf == end_buf_) {
943
0
                err_ = JsonbErrType::E_DOUBLE_OVERFLOW;
944
0
                return false;
945
0
            }
946
947
511
            if (ch == 'e' || ch == 'E') {
948
0
                skipChar(in); // remove 'E'
949
0
                return parseExponent(in, pbuf);
950
511
            } else if (ch < '0' || ch > '9') {
951
11
                err_ = JsonbErrType::E_INVALID_DECIMAL;
952
11
                return false;
953
11
            }
954
955
500
            skipChar(in);
956
500
            ch = in.peek();
957
500
        }
958
250
        if (save_pos == pbuf) {
959
0
            err_ = JsonbErrType::E_INVALID_DECIMAL; // empty input
960
0
            return false;
961
0
        }
962
963
250
        *pbuf = 0; // set null-terminator
964
250
        return internConvertBufferToDouble(num_buf_, pbuf - num_buf_);
965
250
    }
966
967
    // parse the exponent part of a double number
968
0
    bool parseExponent(std::istream& in, char* pbuf) {
969
0
        char ch = in.peek();
970
0
        if (in.good()) {
971
0
            if (ch == '+' || ch == '-') {
972
0
                *(pbuf++) = ch;
973
0
                if (pbuf == end_buf_) {
974
0
                    err_ = JsonbErrType::E_DOUBLE_OVERFLOW;
975
0
                    return false;
976
0
                }
977
0
                skipChar(in);
978
0
                ch = in.peek();
979
0
            }
980
0
        }
981
982
0
        char* save_pos = pbuf;
983
0
        while (in.good() && !strchr(kJsonDelim, ch)) {
984
0
            *(pbuf++) = ch;
985
0
            if (pbuf == end_buf_) {
986
0
                err_ = JsonbErrType::E_DOUBLE_OVERFLOW;
987
0
                return false;
988
0
            }
989
990
0
            if (ch < '0' || ch > '9') {
991
0
                err_ = JsonbErrType::E_INVALID_EXPONENT;
992
0
                return false;
993
0
            }
994
995
0
            skipChar(in);
996
0
            ch = in.peek();
997
0
        }
998
0
        if (save_pos == pbuf) {
999
0
            err_ = JsonbErrType::E_INVALID_EXPONENT; // empty input
1000
0
            return false;
1001
0
        }
1002
1003
0
        *pbuf = 0; // set null-terminator
1004
0
        return internConvertBufferToDouble(num_buf_, pbuf - num_buf_);
1005
0
    }
1006
1007
    // convert the buffered number string to a double and write it out
1008
250
    bool internConvertBufferToDouble(char* num_buf_, int len) {
1009
250
        StringParser::ParseResult parse_result = StringParser::PARSE_SUCCESS;
1010
250
        double val = StringParser::string_to_float<double>(num_buf_, len, &parse_result);
1011
250
        if (parse_result != StringParser::PARSE_SUCCESS) {
1012
0
            VLOG_ROW << "debug string_to_float error for " << num_buf_ << " val=" << val
1013
0
                     << " parse_result=" << parse_result;
1014
0
            err_ = JsonbErrType::E_DECIMAL_OVERFLOW;
1015
0
            return false;
1016
0
        }
1017
1018
250
        if (writer_.writeDouble(val) == 0) {
1019
0
            err_ = JsonbErrType::E_OUTPUT_FAIL;
1020
0
            return false;
1021
0
        }
1022
1023
250
        return true;
1024
250
    }
1025
1026
6.06k
    void trim(std::istream& in) {
1027
7.29k
        while (in.good() && strchr(kWhiteSpace, in.peek())) {
1028
1.22k
            skipChar(in);
1029
1.22k
        }
1030
6.06k
    }
1031
1032
    /*
1033
   * Helper functions to keep track of characters read.
1034
   * Do not rely on std::istream's tellg() which may not be implemented.
1035
   */
1036
1037
6.49k
    char nextChar(std::istream& in) {
1038
6.49k
        ++stream_pos_;
1039
6.49k
        return in.get();
1040
6.49k
    }
1041
1042
7.71k
    void skipChar(std::istream& in) {
1043
7.71k
        ++stream_pos_;
1044
7.71k
        in.ignore();
1045
7.71k
    }
1046
1047
private:
1048
    JsonbWriterT<OS_TYPE> writer_; // JSONB writer the parse routines emit values through
1049
    uint32_t stream_pos_; // advanced by one on every nextChar()/skipChar() call
1050
    JsonbErrType err_; // error code set by a parse routine before it returns false
1051
    char num_buf_[512]; // buffer to hold number string (null-terminated before conversion)
1052
    // Last byte of num_buf_. The number parsers fail once their write position
    // reaches it, so the null-terminator always fits.
    // NOTE(review): this points into the object itself, so a member-wise copy
    // would leave the copy pointing at the source's buffer - confirm the
    // parser is never copied.
    const char* end_buf_ = num_buf_ + sizeof(num_buf_) - 1;
1053
    // NOTE(review): presumably the current container nesting depth; it is not
    // used in this part of the file - confirm against the object/array parsers.
    uint32_t nesting_lvl_ = 0;
1054
};
1055
1056
// Default parser type: JsonbParserT specialized with JsonbOutStream.
using JsonbParser = JsonbParserT<JsonbOutStream>;
1057
1058
} // namespace doris
1059
1060
#endif // JSONB_JSONBJSONPARSER_H