Coverage Report

Created: 2025-03-10 18:45

/root/doris/be/src/util/jsonb_parser.h
Line
Count
Source (jump to first uncovered line)
1
/*
2
 *  Copyright (c) 2014, Facebook, Inc.
3
 *  All rights reserved.
4
 *
5
 *  This source code is licensed under the BSD-style license found in the
6
 *  LICENSE file in the root directory of this source tree. An additional grant
7
 *  of patent rights can be found in the PATENTS file in the same directory.
8
 *
9
 */
10
11
/*
12
 * This file defines JsonbParserT (template) and JsonbParser.
13
 *
14
 * JsonbParserT is a template class which implements a JSON parser.
15
 * JsonbParserT parses JSON text, and serializes it to JSONB binary format
16
 * by using JsonbWriterT object. By default, JsonbParserT creates a new
17
 * JsonbWriterT object with an output stream object.  However, you can also
18
 * pass in your JsonbWriterT or any stream object that implements some basic
19
 * interface of std::ostream (see JsonbStream.h).
20
 *
21
 * JsonbParser specializes JsonbParserT with JsonbOutStream type (see
22
 * JsonbStream.h). So unless you want to provide your own output stream
23
 * type, use JsonbParser object.
24
 *
25
 * ** Parsing JSON **
26
 * JsonbParserT parses JSON string, and directly serializes into JSONB
27
 * packed bytes. There are three ways to parse a JSON string: (1) using
28
 * c-string, (2) using string with len, (3) using std::istream object. You can
29
 * use custom streambuf to redirect output. JsonbOutBuffer is a streambuf used
30
 * internally if the input is raw character buffer.
31
 *
32
 * You can reuse a JsonbParserT object to parse/serialize multiple JSON
33
 * strings, and the previous JSONB will be overwritten.
34
 *
35
 * If parsing fails (returned false), the error code will be set to one of
36
 * JsonbErrType, and can be retrieved by calling getErrorCode().
37
 *
38
 * ** External dictionary **
39
 * During parsing a JSON string, you can pass a call-back function to map a key
40
 * string to an id, and store the dictionary id in JSONB to save space. The
41
 * purpose of using an external dictionary is more towards a collection of
42
 * documents (which has common keys) rather than a single document, so that
43
 * space saving will be significant.
44
 *
45
 * ** Endianness **
46
 * Note: JSONB serialization doesn't assume endianness of the server. However
47
 * you will need to ensure that the endianness at the reader side is the same
48
 * as that at the writer side (if they are on different machines). Otherwise,
49
 * proper conversion is needed when a number value is returned to the
50
 * caller/writer.
51
 *
52
 * @author Tian Xia <tianx@fb.com>
53
 * 
54
 * this file is copied from 
55
 * https://github.com/facebook/mysql-5.6/blob/fb-mysql-5.6.35/fbson/FbsonJsonParser.h
56
 * and modified by Doris
57
 */
58
59
#ifndef JSONB_JSONBJSONPARSER_H
60
#define JSONB_JSONBJSONPARSER_H
61
62
#include <cmath>
63
#include <limits>
64
65
#include "jsonb_document.h"
66
#include "jsonb_error.h"
67
#include "jsonb_writer.h"
68
#include "string_parser.hpp"
69
70
namespace doris {
71
72
// Characters that end a scalar token: the number parsers stop consuming input
// as soon as strchr() finds the current character in this set.
const char* const kJsonDelim = " ,]}\t\r\n";
73
// Space, tab, newline and carriage return.
// NOTE(review): presumably the set skipped by trim(); its use is not visible here - confirm.
const char* const kWhiteSpace = " \t\n\r";
74
75
/*
76
 * Template JsonbParserT
77
 */
78
template <class OS_TYPE>
79
class JsonbParserT {
80
public:
81
1.00k
    // Builds a parser with a default-constructed writer, position 0 and no error.
    JsonbParserT()
            : stream_pos_(0),
              err_(JsonbErrType::E_NONE) {}
82
83
    // Builds a parser whose writer is constructed from the caller's output stream `os`.
    explicit JsonbParserT(OS_TYPE& os)
            : writer_(os),
              stream_pos_(0),
              err_(JsonbErrType::E_NONE) {}
84
85
    // parse a UTF-8 JSON string
86
    bool parse(const std::string& str, hDictInsert handler = nullptr) {
87
        return parse(str.c_str(), str.size(), handler);
88
    }
89
90
    // parse a UTF-8 JSON c-style string (NULL terminated)
91
    bool parse(const char* c_str, hDictInsert handler = nullptr) {
92
        return parse(c_str, strlen(c_str), handler);
93
    }
94
95
    // parse a UTF-8 JSON string with length
96
1.21k
    bool parse(const char* pch, size_t len, hDictInsert handler = nullptr) {
97
1.21k
        if (!pch || len == 0) {
98
0
            err_ = JsonbErrType::E_EMPTY_DOCUMENT;
99
0
            return false;
100
0
        }
101
102
1.21k
        JsonbInBuffer sb(pch, len);
103
1.21k
        std::istream in(&sb);
104
1.21k
        return parse(in, handler);
105
1.21k
    }
106
107
    // parse UTF-8 JSON text from an input stream
108
1.21k
    bool parse(std::istream& in, hDictInsert handler = nullptr) {
109
1.21k
        bool res = false;
110
1.21k
        err_ = JsonbErrType::E_NONE;
111
1.21k
        stream_pos_ = 0;
112
113
        // reset output stream
114
1.21k
        writer_.reset();
115
116
1.21k
        trim(in);
117
118
        // TODO(wzy): parsePrimitive should be implemented
119
1.21k
        if (in.peek() == '{') {
120
156
            skipChar(in);
121
156
            res = parseObject(in, handler);
122
1.05k
        } else if (in.peek() == '[') {
123
426
            skipChar(in);
124
426
            res = parseArray(in, handler);
125
629
        } else {
126
629
            res = parsePrimitive(in, handler);
127
629
            if (!res) err_ = handle_parse_failure(in);
128
629
        }
129
130
1.21k
        trim(in);
131
1.21k
        if (res && !in.eof()) {
132
0
            err_ = JsonbErrType::E_INVALID_DOCU;
133
0
            return false;
134
0
        }
135
136
1.21k
        return res;
137
1.21k
    }
138
139
2.28k
    // Returns a mutable reference to the parser's writer_ member.
    JsonbWriterT<OS_TYPE>& getWriter() {
        return writer_;
    }
140
141
70
    // Returns the error code currently stored in err_.
    JsonbErrType getErrorCode() {
        return err_;
    }
142
143
    // Packages the current error state into a JsonbErrInfo: the message looked
    // up from err_ and the position stored in stream_pos_.
    JsonbErrInfo getErrorInfo() {
        assert(err_ < JsonbErrType::E_NUM_ERRORS);

        JsonbErrInfo info;
        info.err_msg = JsonbErrMsg::getErrMsg(err_);
        // stream_pos_ always points to the next char, so err_pos is 1-based
        info.err_pos = stream_pos_;
        return info;
    }
154
155
    // clear error code
156
    void clearErr() { err_ = JsonbErrType::E_NONE; }
157
158
private:
159
0
    // Chooses the error code for a top-level literal that was re-parsed after a
    // failed document parse.
    // If the literal did not parse, the result is E_INVALID_DOCU. If it parsed,
    // trim() is called; a stream that is no longer good() afterwards yields
    // E_INVALID_DOCU_COMPAT, otherwise E_INVALID_DOCU.
    JsonbErrType handle_parse_value_failure(bool parse_res, std::istream& in) {
        if (!parse_res) {
            return JsonbErrType::E_INVALID_DOCU;
        }

        trim(in);
        return in.good() ? JsonbErrType::E_INVALID_DOCU : JsonbErrType::E_INVALID_DOCU_COMPAT;
    }
169
170
    // In case json is determined to be invalid at top level,
171
    // try to parse literal values.
172
    // We return a different error code E_INVALID_DOCU_COMPAT
173
    // in case the input json contains these values.
174
    // Returning a different error code will cause an
175
    // auditing on the caller.
176
    // This is mainly done because 8.0 JSON_VALID considers
177
    // this as a valid input.
178
48
    JsonbErrType handle_parse_failure(std::istream& in) {
179
48
        JsonbErrType error = JsonbErrType::E_INVALID_DOCU;
180
48
        if (!writer_.writeStartArray()) {
181
0
            return error;
182
0
        }
183
184
48
        switch (in.peek()) {
185
0
        case 'n':
186
0
            skipChar(in);
187
0
            error = handle_parse_value_failure(parseNull(in), in);
188
0
            break;
189
0
        case 't':
190
0
            skipChar(in);
191
0
            error = handle_parse_value_failure(parseTrue(in), in);
192
0
            break;
193
0
        case 'f':
194
0
            skipChar(in);
195
0
            error = handle_parse_value_failure(parseFalse(in), in);
196
0
            break;
197
0
        case '"':
198
0
            skipChar(in);
199
0
            error = handle_parse_value_failure(parseString(in), in);
200
0
            break;
201
48
        default:
202
48
            if (parseNumber(in)) {
203
0
                trim(in);
204
0
                if (in.eof()) {
205
0
                    error = JsonbErrType::E_INVALID_DOCU_COMPAT;
206
0
                }
207
0
            }
208
48
        }
209
48
        if (!writer_.writeEndArray()) {
210
0
            return error;
211
0
        }
212
213
48
        return error;
214
48
    }
215
216
    // parse primitive
217
629
    bool parsePrimitive(std::istream& in, hDictInsert handler) {
218
629
        bool res = false;
219
629
        switch (in.peek()) {
220
63
        case 'n':
221
63
            skipChar(in);
222
63
            res = parseNull(in);
223
63
            break;
224
63
        case 't':
225
63
            skipChar(in);
226
63
            res = parseTrue(in);
227
63
            break;
228
63
        case 'f':
229
63
            skipChar(in);
230
63
            res = parseFalse(in);
231
63
            break;
232
71
        case '"':
233
71
            skipChar(in);
234
71
            res = parseString(in);
235
71
            break;
236
369
        default:
237
369
            res = parseNumber(in);
238
629
        }
239
240
629
        return res;
241
629
    }
242
243
    // parse a JSON object (comma-separated list of key-value pairs)
244
254
    bool parseObject(std::istream& in, hDictInsert handler) {
245
254
        if (!writer_.writeStartObject()) {
246
0
            err_ = JsonbErrType::E_OUTPUT_FAIL;
247
0
            return false;
248
0
        }
249
250
254
        trim(in);
251
252
254
        if (in.peek() == '}') {
253
66
            skipChar(in);
254
            // empty object
255
66
            if (!writer_.writeEndObject()) {
256
0
                err_ = JsonbErrType::E_OUTPUT_FAIL;
257
0
                return false;
258
0
            }
259
66
            return true;
260
66
        }
261
262
347
        while (in.good()) {
263
347
            if (nextChar(in) != '"') {
264
11
                err_ = JsonbErrType::E_INVALID_OBJ;
265
11
                return false;
266
11
            }
267
268
336
            if (!parseKVPair(in, handler)) {
269
0
                return false;
270
0
            }
271
272
336
            trim(in);
273
274
336
            char ch = nextChar(in);
275
336
            if (ch == '}') {
276
                // end of the object
277
177
                if (!writer_.writeEndObject()) {
278
0
                    err_ = JsonbErrType::E_OUTPUT_FAIL;
279
0
                    return false;
280
0
                }
281
177
                return true;
282
177
            } else if (ch != ',') {
283
0
                err_ = JsonbErrType::E_INVALID_OBJ;
284
0
                return false;
285
0
            }
286
287
159
            trim(in);
288
159
        }
289
290
0
        err_ = JsonbErrType::E_INVALID_OBJ;
291
0
        return false;
292
188
    }
293
294
    // parse a JSON array (comma-separated list of values)
295
428
    bool parseArray(std::istream& in, hDictInsert handler) {
296
428
        if (!writer_.writeStartArray()) {
297
0
            err_ = JsonbErrType::E_OUTPUT_FAIL;
298
0
            return false;
299
0
        }
300
301
428
        trim(in);
302
303
428
        if (in.peek() == ']') {
304
63
            skipChar(in);
305
            // empty array
306
63
            if (!writer_.writeEndArray()) {
307
0
                err_ = JsonbErrType::E_OUTPUT_FAIL;
308
0
                return false;
309
0
            }
310
63
            return true;
311
63
        }
312
313
1.36k
        while (in.good()) {
314
1.36k
            if (!parseValue(in, handler)) {
315
11
                return false;
316
11
            }
317
318
1.34k
            trim(in);
319
320
1.34k
            char ch = nextChar(in);
321
1.34k
            if (ch == ']') {
322
                // end of the array
323
354
                if (!writer_.writeEndArray()) {
324
0
                    err_ = JsonbErrType::E_OUTPUT_FAIL;
325
0
                    return false;
326
0
                }
327
354
                return true;
328
995
            } else if (ch != ',') {
329
0
                err_ = JsonbErrType::E_INVALID_ARR;
330
0
                return false;
331
0
            }
332
333
995
            trim(in);
334
995
        }
335
336
0
        err_ = JsonbErrType::E_INVALID_ARR;
337
0
        return false;
338
365
    }
339
340
    // parse a key-value pair, separated by ":"
341
336
    bool parseKVPair(std::istream& in, hDictInsert handler) {
342
336
        if (parseKey(in, handler) && parseValue(in, handler)) {
343
336
            return true;
344
336
        }
345
346
0
        return false;
347
336
    }
348
349
    // parse a key (must be string)
350
336
    bool parseKey(std::istream& in, hDictInsert handler) {
351
336
        char key[JsonbKeyValue::sMaxKeyLen];
352
336
        int key_len = 0;
353
1.04k
        while (in.good() && in.peek() != '"' && key_len < JsonbKeyValue::sMaxKeyLen) {
354
712
            char ch = nextChar(in);
355
712
            if (ch == '\\') {
356
0
                char escape_buffer[5]; // buffer for escape
357
0
                int len;
358
0
                if (!parseEscape(in, escape_buffer, len)) {
359
0
                    err_ = JsonbErrType::E_INVALID_KEY_STRING;
360
0
                    return false;
361
0
                }
362
0
                if (key_len + len >= JsonbKeyValue::sMaxKeyLen) {
363
0
                    err_ = JsonbErrType::E_INVALID_KEY_LENGTH;
364
0
                    return false;
365
0
                }
366
0
                memcpy(key + key_len, escape_buffer, len);
367
0
                key_len += len;
368
712
            } else {
369
712
                key[key_len++] = ch;
370
712
            }
371
712
        }
372
        // The JSON key can be an empty string.
373
336
        if (!in.good() || in.peek() != '"') {
374
0
            if (key_len == JsonbKeyValue::sMaxKeyLen)
375
0
                err_ = JsonbErrType::E_INVALID_KEY_LENGTH;
376
0
            else
377
0
                err_ = JsonbErrType::E_INVALID_KEY_STRING;
378
0
            return false;
379
0
        }
380
381
336
        skipChar(in); // discard '"'
382
383
336
        int key_id = -1;
384
336
        if (handler) {
385
0
            key_id = handler(key, key_len);
386
0
        }
387
388
336
        if (key_id < 0) {
389
336
            writer_.writeKey(key, key_len);
390
336
        } else {
391
0
            writer_.writeKey(key_id);
392
0
        }
393
394
336
        trim(in);
395
396
336
        if (nextChar(in) != ':') {
397
0
            err_ = JsonbErrType::E_INVALID_OBJ;
398
0
            return false;
399
0
        }
400
401
336
        trim(in);
402
336
        if (!in.good()) {
403
0
            err_ = JsonbErrType::E_INVALID_OBJ;
404
0
            return false;
405
0
        }
406
407
336
        return true;
408
336
    }
409
410
    // parse a value
411
1.69k
    bool parseValue(std::istream& in, hDictInsert handler) {
412
1.69k
        bool res = false;
413
414
1.69k
        switch (in.peek()) {
415
0
        case 'N':
416
109
        case 'n': {
417
109
            skipChar(in);
418
109
            res = parseNull(in);
419
109
            break;
420
0
        }
421
0
        case 'T':
422
113
        case 't': {
423
113
            skipChar(in);
424
113
            res = parseTrue(in);
425
113
            break;
426
0
        }
427
0
        case 'F':
428
109
        case 'f': {
429
109
            skipChar(in);
430
109
            res = parseFalse(in);
431
109
            break;
432
0
        }
433
517
        case '"': {
434
517
            skipChar(in);
435
517
            res = parseString(in);
436
517
            break;
437
0
        }
438
98
        case '{': {
439
98
            skipChar(in);
440
98
            ++nesting_lvl_;
441
98
            if (nesting_lvl_ >= MaxNestingLevel) {
442
0
                err_ = JsonbErrType::E_NESTING_LVL_OVERFLOW;
443
0
                return false;
444
0
            }
445
98
            res = parseObject(in, handler);
446
98
            if (res) {
447
98
                --nesting_lvl_;
448
98
            }
449
98
            break;
450
98
        }
451
2
        case '[': {
452
2
            skipChar(in);
453
2
            ++nesting_lvl_;
454
2
            if (nesting_lvl_ >= MaxNestingLevel) {
455
0
                err_ = JsonbErrType::E_NESTING_LVL_OVERFLOW;
456
0
                return false;
457
0
            }
458
2
            res = parseArray(in, handler);
459
2
            if (res) {
460
2
                --nesting_lvl_;
461
2
            }
462
2
            break;
463
2
        }
464
748
        default: {
465
748
            res = parseNumber(in);
466
748
            break;
467
2
        }
468
1.69k
        }
469
470
1.69k
        return res;
471
1.69k
    }
472
473
    // parse NULL value
474
172
    bool parseNull(std::istream& in) {
475
172
        if (tolower(nextChar(in)) == 'u' && tolower(nextChar(in)) == 'l' &&
476
172
            tolower(nextChar(in)) == 'l') {
477
172
            writer_.writeNull();
478
172
            return true;
479
172
        }
480
481
0
        err_ = JsonbErrType::E_INVALID_SCALAR_VALUE;
482
0
        return false;
483
172
    }
484
485
    // parse TRUE value
486
176
    bool parseTrue(std::istream& in) {
487
176
        if (tolower(nextChar(in)) == 'r' && tolower(nextChar(in)) == 'u' &&
488
176
            tolower(nextChar(in)) == 'e') {
489
176
            writer_.writeBool(true);
490
176
            return true;
491
176
        }
492
493
0
        err_ = JsonbErrType::E_INVALID_SCALAR_VALUE;
494
0
        return false;
495
176
    }
496
497
    // parse FALSE value
498
172
    bool parseFalse(std::istream& in) {
499
172
        if (tolower(nextChar(in)) == 'a' && tolower(nextChar(in)) == 'l' &&
500
172
            tolower(nextChar(in)) == 's' && tolower(nextChar(in)) == 'e') {
501
172
            writer_.writeBool(false);
502
172
            return true;
503
172
        }
504
505
0
        err_ = JsonbErrType::E_INVALID_SCALAR_VALUE;
506
0
        return false;
507
172
    }
508
509
    /*
510
    This is a helper function to parse the hex value. hex_num means the
511
    number of digits needed to be parsed. If less than zero, then it will
512
    consider all the characters between current and any character in JsonDelim.
513
  */
514
0
    unsigned parseHexHelper(std::istream& in, uint64_t& val, unsigned hex_num = 17) {
515
        // We can't read more than 17 digits, so when read 17 digits, it's overflow
516
0
        val = 0;
517
0
        unsigned num_digits = 0;
518
0
        char ch = tolower(in.peek());
519
0
        while (in.good() && !strchr(kJsonDelim, ch) && num_digits != hex_num) {
520
0
            if (ch >= '0' && ch <= '9') {
521
0
                val = (val << 4) + (ch - '0');
522
0
            } else if (ch >= 'a' && ch <= 'f') {
523
0
                val = (val << 4) + (ch - 'a' + 10);
524
0
            } else {
525
                // unrecognized hex digit
526
0
                return 0;
527
0
            }
528
0
            skipChar(in);
529
0
            ch = tolower(in.peek());
530
0
            ++num_digits;
531
0
        }
532
0
        return num_digits;
533
0
    }
534
535
    // parse HEX value
536
0
    bool parseHex4(std::istream& in, unsigned& h) {
537
0
        uint64_t val;
538
0
        if (4 == parseHexHelper(in, val, 4)) {
539
0
            h = (unsigned)val;
540
0
            return true;
541
0
        }
542
0
        return false;
543
0
    }
544
545
    /*
546
     parse Escape char.
547
  */
548
0
    bool parseEscape(std::istream& in, char* out, int& len) {
549
        /*
550
      This is extracted from cJSON implementation.
551
      This is about the mask of the first byte in UTF-8.
552
      The mask is defined in:
553
      http://en.wikipedia.org/wiki/UTF-8#Description
554
    */
555
0
        const unsigned char firstByteMark[6] = {0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};
556
0
        if (!in.good()) {
557
0
            return false;
558
0
        }
559
0
        char c = nextChar(in);
560
0
        len = 1;
561
0
        switch (c) {
562
        // \" \\ \/  \b \f \n \r \t
563
0
        case '"':
564
0
            *out = '"';
565
0
            return true;
566
0
        case '\\':
567
0
            *out = '\\';
568
0
            return true;
569
0
        case '/':
570
0
            *out = '/';
571
0
            return true;
572
0
        case 'b':
573
0
            *out = '\b';
574
0
            return true;
575
0
        case 'f':
576
0
            *out = '\f';
577
0
            return true;
578
0
        case 'n':
579
0
            *out = '\n';
580
0
            return true;
581
0
        case 'r':
582
0
            *out = '\r';
583
0
            return true;
584
0
        case 't':
585
0
            *out = '\t';
586
0
            return true;
587
0
        case 'u': {
588
0
            unsigned uc;
589
0
            if (!parseHex4(in, uc)) {
590
0
                return false;
591
0
            }
592
            /*
593
          For DC00 to DFFF, it should be low surrogates for UTF16.
594
          So if it display in the high bits, it's invalid.
595
        */
596
0
            if (uc >= 0xDC00 && uc <= 0xDFFF) {
597
0
                return false;
598
0
            }
599
600
            /*
601
          For D800 to DBFF, it's the high surrogates for UTF16.
602
          So it's utf-16, there must be another one between 0xDC00
603
          and 0xDFFF.
604
        */
605
0
            if (uc >= 0xD800 && uc <= 0xDBFF) {
606
0
                unsigned uc2;
607
608
0
                if (!in.good()) {
609
0
                    return false;
610
0
                }
611
0
                c = nextChar(in);
612
0
                if (c != '\\') {
613
0
                    return false;
614
0
                }
615
616
0
                if (!in.good()) {
617
0
                    return false;
618
0
                }
619
0
                c = nextChar(in);
620
0
                if (c != 'u') {
621
0
                    return false;
622
0
                }
623
624
0
                if (!parseHex4(in, uc2)) {
625
0
                    return false;
626
0
                }
627
                /*
628
            Now we need the low surrogates for UTF16. It should be
629
            within 0xDC00 and 0xDFFF.
630
          */
631
0
                if (uc2 < 0xDC00 || uc2 > 0xDFFF) return false;
632
                /*
633
            For the character that not in the Basic Multilingual Plan,
634
            it's represented as twelve-character, encoding the UTF-16
635
            surrogate pair.
636
            UTF16 is between 0x10000 and 0x10FFFF. The high surrogate
637
            present the high bits and the low surrogate present the
638
            lower 10 bits.
639
            For detailed explanation, please refer to:
640
            http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-404.pdf
641
            Then it will be converted to UTF8.
642
          */
643
0
                uc = 0x10000 + (((uc & 0x3FF) << 10) | (uc2 & 0x3FF));
644
0
            }
645
646
            /*
647
          Get the length of the unicode.
648
          Please refer to http://en.wikipedia.org/wiki/UTF-8#Description.
649
        */
650
0
            if (uc < 0x80)
651
0
                len = 1;
652
0
            else if (uc < 0x800)
653
0
                len = 2;
654
0
            else if (uc < 0x10000)
655
0
                len = 3;
656
0
            else
657
0
                len = 4;
658
0
            out += len;
659
            /*
660
          Encode it.
661
          Please refer to http://en.wikipedia.org/wiki/UTF-8#Description.
662
          This part of code has a reference to cJSON.
663
        */
664
0
            switch (len) {
665
0
            case 4:
666
0
                *--out = ((uc | 0x80) & 0xBF);
667
0
                uc >>= 6;
668
0
                [[fallthrough]];
669
0
            case 3:
670
0
                *--out = ((uc | 0x80) & 0xBF);
671
0
                uc >>= 6;
672
0
                [[fallthrough]];
673
0
            case 2:
674
0
                *--out = ((uc | 0x80) & 0xBF);
675
0
                uc >>= 6;
676
0
                [[fallthrough]];
677
0
            case 1:
678
                // Mask the first byte according to the standard.
679
0
                *--out = (uc | firstByteMark[len - 1]);
680
0
            }
681
0
            return true;
682
0
            break;
683
0
        }
684
0
        default:
685
0
            return false;
686
0
            break;
687
0
        }
688
0
    }
689
690
    // parse a string
691
588
    bool parseString(std::istream& in) {
692
588
        const int BUFFER_LEN = 4096;
693
588
        if (!writer_.writeStartString()) {
694
0
            err_ = JsonbErrType::E_OUTPUT_FAIL;
695
0
            return false;
696
0
        }
697
698
        // write 4KB at a time
699
588
        char buffer[BUFFER_LEN];
700
588
        int nread = 0;
701
2.24k
        while (in.good()) {
702
2.24k
            char ch = nextChar(in);
703
2.24k
            if (ch == '"') {
704
                // write all remaining bytes in the buffer
705
588
                if (nread > 0) {
706
588
                    if (!writer_.writeString(buffer, nread)) {
707
0
                        err_ = JsonbErrType::E_OUTPUT_FAIL;
708
0
                        return false;
709
0
                    }
710
588
                }
711
                // end writing string
712
588
                if (!writer_.writeEndString()) {
713
0
                    err_ = JsonbErrType::E_OUTPUT_FAIL;
714
0
                    return false;
715
0
                }
716
588
                return true;
717
1.65k
            } else if (ch == '\\') {
718
                // this is a escape char
719
0
                char escape_buffer[5]; // buffer for escape
720
0
                int len;
721
0
                if (!parseEscape(in, escape_buffer, len)) {
722
0
                    err_ = JsonbErrType::E_INVALID_STR;
723
0
                    return false;
724
0
                }
725
726
                // Write each char to the buffer
727
0
                for (int i = 0; i != len; ++i) {
728
0
                    buffer[nread++] = escape_buffer[i];
729
0
                    if (nread == BUFFER_LEN) {
730
0
                        if (!writer_.writeString(buffer, nread)) {
731
0
                            err_ = JsonbErrType::E_OUTPUT_FAIL;
732
0
                            return false;
733
0
                        }
734
0
                        nread = 0;
735
0
                    }
736
0
                }
737
1.65k
            } else {
738
                // just a char
739
1.65k
                buffer[nread++] = ch;
740
1.65k
                if (nread == BUFFER_LEN) {
741
                    // flush buffer
742
0
                    if (!writer_.writeString(buffer, nread)) {
743
0
                        err_ = JsonbErrType::E_OUTPUT_FAIL;
744
0
                        return false;
745
0
                    }
746
0
                    nread = 0;
747
0
                }
748
1.65k
            }
749
2.24k
        }
750
751
0
        err_ = JsonbErrType::E_INVALID_STR;
752
0
        return false;
753
588
    }
754
755
    // parse a number
756
    // Number format can be hex, octal, or decimal (including float).
757
    // Only decimal can have (+/-) sign prefix.
758
1.16k
    bool parseNumber(std::istream& in) {
759
1.16k
        bool ret = false;
760
1.16k
        switch (in.peek()) {
761
0
        case '0': {
762
0
            skipChar(in);
763
764
0
            if (in.peek() == 'x' || in.peek() == 'X') {
765
0
                skipChar(in);
766
0
                ret = parseHex(in);
767
0
            } else if (in.peek() == '.') {
768
0
                skipChar(in); // remove '.'
769
0
                num_buf_[0] = '.';
770
0
                ret = parseDouble(in, num_buf_ + 1);
771
0
            } else {
772
0
                ret = parseOctal(in);
773
0
            }
774
775
0
            break;
776
0
        }
777
0
        case '-': {
778
0
            skipChar(in);
779
0
            ret = parseDecimal(in, true);
780
0
            break;
781
0
        }
782
0
        case '+':
783
0
            skipChar(in);
784
        // fall through
785
1.16k
        default:
786
1.16k
            ret = parseDecimal(in);
787
1.16k
            break;
788
1.16k
        }
789
790
1.16k
        return ret;
791
1.16k
    }
792
793
    // parse a number in hex format
794
0
    bool parseHex(std::istream& in) {
795
0
        uint64_t val = 0;
796
0
        int num_digits;
797
0
        if (0 == (num_digits = parseHexHelper(in, val))) {
798
0
            err_ = JsonbErrType::E_INVALID_HEX;
799
0
            return false;
800
0
        }
801
802
0
        int size = 0;
803
0
        if (num_digits <= 2) {
804
0
            size = writer_.writeInt8((int8_t)val);
805
0
        } else if (num_digits <= 4) {
806
0
            size = writer_.writeInt16((int16_t)val);
807
0
        } else if (num_digits <= 8) {
808
0
            size = writer_.writeInt32((int32_t)val);
809
0
        } else if (num_digits <= 16) {
810
0
            size = writer_.writeInt64(val);
811
0
        } else {
812
0
            err_ = JsonbErrType::E_HEX_OVERFLOW;
813
0
            return false;
814
0
        }
815
816
0
        if (size == 0) {
817
0
            err_ = JsonbErrType::E_OUTPUT_FAIL;
818
0
            return false;
819
0
        }
820
821
0
        return true;
822
0
    }
823
824
    // parse a number in octal format
825
0
    bool parseOctal(std::istream& in) {
826
0
        int64_t val = 0;
827
0
        char ch = in.peek();
828
0
        while (in.good() && !strchr(kJsonDelim, ch)) {
829
0
            if (ch >= '0' && ch <= '7') {
830
0
                val = val * 8 + (ch - '0');
831
0
            } else {
832
0
                err_ = JsonbErrType::E_INVALID_OCTAL;
833
0
                return false;
834
0
            }
835
836
            // check if the number overflows
837
0
            if (val < 0) {
838
0
                err_ = JsonbErrType::E_OCTAL_OVERFLOW;
839
0
                return false;
840
0
            }
841
842
0
            skipChar(in);
843
0
            ch = in.peek();
844
0
        }
845
846
0
        int size = 0;
847
0
        if (val <= std::numeric_limits<int8_t>::max()) {
848
0
            size = writer_.writeInt8((int8_t)val);
849
0
        } else if (val <= std::numeric_limits<int16_t>::max()) {
850
0
            size = writer_.writeInt16((int16_t)val);
851
0
        } else if (val <= std::numeric_limits<int32_t>::max()) {
852
0
            size = writer_.writeInt32((int32_t)val);
853
0
        } else { // val <= INT64_MAX
854
0
            size = writer_.writeInt64(val);
855
0
        }
856
857
0
        if (size == 0) {
858
0
            err_ = JsonbErrType::E_OUTPUT_FAIL;
859
0
            return false;
860
0
        }
861
862
0
        return true;
863
0
    }
864
865
    // parse a number in decimal (including float)
866
1.16k
    bool parseDecimal(std::istream& in, bool neg = false) {
867
1.16k
        char ch = 0;
868
1.16k
        while (in.good() && (ch = in.peek()) == '0') skipChar(in);
869
870
1.16k
        char* pbuf = num_buf_;
871
1.16k
        if (neg) *(pbuf++) = '-';
872
873
1.16k
        char* save_pos = pbuf;
874
5.22k
        while (in.good() && !strchr(kJsonDelim, ch)) {
875
4.43k
            *(pbuf++) = ch;
876
4.43k
            if (pbuf == end_buf_) {
877
0
                err_ = JsonbErrType::E_DECIMAL_OVERFLOW;
878
0
                return false;
879
0
            }
880
881
4.43k
            if (ch == '.') {
882
282
                skipChar(in); // remove '.'
883
282
                return parseDouble(in, pbuf);
884
4.15k
            } else if (ch == 'E' || ch == 'e') {
885
0
                skipChar(in); // remove 'E'
886
0
                return parseExponent(in, pbuf);
887
4.15k
            } else if (ch < '0' || ch > '9') {
888
96
                err_ = JsonbErrType::E_INVALID_DECIMAL;
889
96
                return false;
890
96
            }
891
892
4.05k
            skipChar(in);
893
4.05k
            ch = in.peek();
894
4.05k
        }
895
787
        if (save_pos == pbuf) {
896
0
            err_ = JsonbErrType::E_INVALID_DECIMAL; // empty input
897
0
            return false;
898
0
        }
899
900
787
        *pbuf = 0; // set null-terminator
901
787
        StringParser::ParseResult parse_result = StringParser::PARSE_SUCCESS;
902
787
        int128_t val =
903
787
                StringParser::string_to_int<int128_t>(num_buf_, pbuf - num_buf_, &parse_result);
904
787
        if (parse_result != StringParser::PARSE_SUCCESS) {
905
0
            VLOG_ROW << "debug string_to_int error for " << num_buf_ << " val=" << val
906
0
                     << " parse_result=" << parse_result;
907
0
            err_ = JsonbErrType::E_DECIMAL_OVERFLOW;
908
0
            return false;
909
0
        }
910
911
787
        int size = 0;
912
787
        if (val >= std::numeric_limits<int8_t>::min() &&
913
787
            val <= std::numeric_limits<int8_t>::max()) {
914
363
            size = writer_.writeInt8((int8_t)val);
915
424
        } else if (val >= std::numeric_limits<int16_t>::min() &&
916
424
                   val <= std::numeric_limits<int16_t>::max()) {
917
298
            size = writer_.writeInt16((int16_t)val);
918
298
        } else if (val >= std::numeric_limits<int32_t>::min() &&
919
126
                   val <= std::numeric_limits<int32_t>::max()) {
920
63
            size = writer_.writeInt32((int32_t)val);
921
63
        } else if (val >= std::numeric_limits<int64_t>::min() &&
922
63
                   val <= std::numeric_limits<int64_t>::max()) {
923
63
            size = writer_.writeInt64((int64_t)val);
924
63
        } else { // INT128
925
0
            size = writer_.writeInt128(val);
926
0
        }
927
928
787
        if (size == 0) {
929
0
            err_ = JsonbErrType::E_OUTPUT_FAIL;
930
0
            return false;
931
0
        }
932
933
787
        return true;
934
787
    }
935
936
    // parse the fractional part of an IEEE 754 double-precision number
937
282
    bool parseDouble(std::istream& in, char* pbuf) {
938
282
        char* save_pos = pbuf;
939
282
        char ch = in.peek();
940
824
        while (in.good() && !strchr(kJsonDelim, ch)) {
941
553
            *(pbuf++) = ch;
942
553
            if (pbuf == end_buf_) {
943
0
                err_ = JsonbErrType::E_DOUBLE_OVERFLOW;
944
0
                return false;
945
0
            }
946
947
553
            if (ch == 'e' || ch == 'E') {
948
0
                skipChar(in); // remove 'E'
949
0
                return parseExponent(in, pbuf);
950
553
            } else if (ch < '0' || ch > '9') {
951
11
                err_ = JsonbErrType::E_INVALID_DECIMAL;
952
11
                return false;
953
11
            }
954
955
542
            skipChar(in);
956
542
            ch = in.peek();
957
542
        }
958
271
        if (save_pos == pbuf) {
959
0
            err_ = JsonbErrType::E_INVALID_DECIMAL; // empty input
960
0
            return false;
961
0
        }
962
963
271
        *pbuf = 0; // set null-terminator
964
271
        return internConvertBufferToDouble(num_buf_, pbuf - num_buf_);
965
271
    }
966
967
    // parse the exponent part of a double number
968
0
    bool parseExponent(std::istream& in, char* pbuf) {
969
0
        char ch = in.peek();
970
0
        if (in.good()) {
971
0
            if (ch == '+' || ch == '-') {
972
0
                *(pbuf++) = ch;
973
0
                if (pbuf == end_buf_) {
974
0
                    err_ = JsonbErrType::E_DOUBLE_OVERFLOW;
975
0
                    return false;
976
0
                }
977
0
                skipChar(in);
978
0
                ch = in.peek();
979
0
            }
980
0
        }
981
982
0
        char* save_pos = pbuf;
983
0
        while (in.good() && !strchr(kJsonDelim, ch)) {
984
0
            *(pbuf++) = ch;
985
0
            if (pbuf == end_buf_) {
986
0
                err_ = JsonbErrType::E_DOUBLE_OVERFLOW;
987
0
                return false;
988
0
            }
989
990
0
            if (ch < '0' || ch > '9') {
991
0
                err_ = JsonbErrType::E_INVALID_EXPONENT;
992
0
                return false;
993
0
            }
994
995
0
            skipChar(in);
996
0
            ch = in.peek();
997
0
        }
998
0
        if (save_pos == pbuf) {
999
0
            err_ = JsonbErrType::E_INVALID_EXPONENT; // empty input
1000
0
            return false;
1001
0
        }
1002
1003
0
        *pbuf = 0; // set null-terminator
1004
0
        return internConvertBufferToDouble(num_buf_, pbuf - num_buf_);
1005
0
    }
1006
1007
    // convert the buffered number string to a double (via StringParser) and write it out
1008
271
    bool internConvertBufferToDouble(char* num_buf_, int len) {
1009
271
        StringParser::ParseResult parse_result = StringParser::PARSE_SUCCESS;
1010
271
        double val = StringParser::string_to_float<double>(num_buf_, len, &parse_result);
1011
271
        if (parse_result != StringParser::PARSE_SUCCESS) {
1012
0
            VLOG_ROW << "debug string_to_float error for " << num_buf_ << " val=" << val
1013
0
                     << " parse_result=" << parse_result;
1014
0
            err_ = JsonbErrType::E_DECIMAL_OVERFLOW;
1015
0
            return false;
1016
0
        }
1017
1018
271
        if (writer_.writeDouble(val) == 0) {
1019
0
            err_ = JsonbErrType::E_OUTPUT_FAIL;
1020
0
            return false;
1021
0
        }
1022
1023
271
        return true;
1024
271
    }
1025
1026
6.61k
    void trim(std::istream& in) {
1027
7.93k
        while (in.good() && strchr(kWhiteSpace, in.peek())) {
1028
1.32k
            skipChar(in);
1029
1.32k
        }
1030
6.61k
    }
1031
1032
    /*
1033
   * Helper functions to keep track of characters read.
1034
   * Do not rely on std::istream's tellg() which may not be implemented.
1035
   */
1036
1037
7.05k
    char nextChar(std::istream& in) {
1038
7.05k
        ++stream_pos_;
1039
7.05k
        return in.get();
1040
7.05k
    }
1041
1042
8.45k
    void skipChar(std::istream& in) {
1043
8.45k
        ++stream_pos_;
1044
8.45k
        in.ignore();
1045
8.45k
    }
1046
1047
private:
1048
    JsonbWriterT<OS_TYPE> writer_; // writer the number parsers emit their values through
1049
    uint32_t stream_pos_; // count of characters consumed; advanced by nextChar() and skipChar()
1050
    JsonbErrType err_; // error code recorded by a parse routine before it returns false
1051
    char num_buf_[512]; // buffer to hold number string
1052
    const char* end_buf_ = num_buf_ + sizeof(num_buf_) - 1; // last byte of num_buf_; number parsers fail once the write cursor reaches it, keeping room for the null terminator
1053
    uint32_t nesting_lvl_ = 0; // NOTE(review): not referenced in this part of the file; presumably the current container nesting depth - confirm
1054
};
1055
1056
// Default parser: JsonbParserT specialized with the JsonbOutStream output type.
using JsonbParser = JsonbParserT<JsonbOutStream>;
1057
1058
} // namespace doris
1059
1060
#endif // JSONB_JSONBJSONPARSER_H