Coverage Report

Created: 2024-11-21 23:27

/root/doris/be/src/util/jsonb_parser.h
Line
Count
Source (jump to first uncovered line)
1
/*
2
 *  Copyright (c) 2014, Facebook, Inc.
3
 *  All rights reserved.
4
 *
5
 *  This source code is licensed under the BSD-style license found in the
6
 *  LICENSE file in the root directory of this source tree. An additional grant
7
 *  of patent rights can be found in the PATENTS file in the same directory.
8
 *
9
 */
10
11
/*
12
 * This file defines JsonbParserT (template) and JsonbParser.
13
 *
14
 * JsonbParserT is a template class which implements a JSON parser.
15
 * JsonbParserT parses JSON text, and serialize it to JSONB binary format
16
 * by using JsonbWriterT object. By default, JsonbParserT creates a new
17
 * JsonbWriterT object with an output stream object.  However, you can also
18
 * pass in your JsonbWriterT or any stream object that implements some basic
19
 * interface of std::ostream (see JsonbStream.h).
20
 *
21
 * JsonbParser specializes JsonbParserT with JsonbOutStream type (see
22
 * JsonbStream.h). So unless you want to provide your own, different output stream
23
 * type, use JsonbParser object.
24
 *
25
 * ** Parsing JSON **
26
 * JsonbParserT parses JSON string, and directly serializes into JSONB
27
 * packed bytes. There are three ways to parse a JSON string: (1) using
28
 * c-string, (2) using string with len, (3) using std::istream object. You can
29
 * use custom streambuf to redirect output. JsonbInBuffer is a streambuf used
30
 * internally if the input is raw character buffer.
31
 *
32
 * You can reuse a JsonbParserT object to parse/serialize multiple JSON
33
 * strings, and the previous JSONB will be overwritten.
34
 *
35
 * If parsing fails (returned false), the error code will be set to one of
36
 * JsonbErrType, and can be retrieved by calling getErrorCode().
37
 *
38
 * ** External dictionary **
39
 * During parsing a JSON string, you can pass a call-back function to map a key
40
 * string to an id, and store the dictionary id in JSONB to save space. The
41
 * purpose of using an external dictionary is more towards a collection of
42
 * documents (which has common keys) rather than a single document, so that
43
 * space saving will be significant.
44
 *
45
 * ** Endianness **
46
 * Note: JSONB serialization doesn't assume endianness of the server. However
47
 * you will need to ensure that the endianness at the reader side is the same
48
 * as that at the writer side (if they are on different machines). Otherwise,
49
 * proper conversion is needed when a number value is returned to the
50
 * caller/writer.
51
 *
52
 * @author Tian Xia <tianx@fb.com>
53
 * 
54
 * this file is copied from 
55
 * https://github.com/facebook/mysql-5.6/blob/fb-mysql-5.6.35/fbson/FbsonJsonParser.h
56
 * and modified by Doris
57
 */
58
59
#ifndef JSONB_JSONBJSONPARSER_H
60
#define JSONB_JSONBJSONPARSER_H
61
62
#include <cmath>
63
#include <limits>
64
65
#include "jsonb_document.h"
66
#include "jsonb_error.h"
67
#include "jsonb_writer.h"
68
#include "string_parser.hpp"
69
70
namespace doris {
71
72
// Characters that end a scalar token: the number parsers stop scanning as soon
// as they see one of these.
constexpr const char* kJsonDelim = " ,]}\t\r\n";
// Whitespace characters: space, tab, line feed, carriage return.
constexpr const char* kWhiteSpace = " \t\n\r";
74
75
/*
76
 * Template JsonbParserT
77
 */
78
template <class OS_TYPE>
79
class JsonbParserT {
80
public:
81
991
    JsonbParserT() : stream_pos_(0), err_(JsonbErrType::E_NONE) {}
82
83
    explicit JsonbParserT(OS_TYPE& os) : writer_(os), stream_pos_(0), err_(JsonbErrType::E_NONE) {}
84
85
    // parse a UTF-8 JSON string
86
    bool parse(const std::string& str, hDictInsert handler = nullptr) {
87
        return parse(str.c_str(), str.size(), handler);
88
    }
89
90
    // parse a UTF-8 JSON c-style string (NULL terminated)
91
    bool parse(const char* c_str, hDictInsert handler = nullptr) {
92
        return parse(c_str, strlen(c_str), handler);
93
    }
94
95
    // parse a UTF-8 JSON string with length
96
1.19k
    bool parse(const char* pch, size_t len, hDictInsert handler = nullptr) {
97
1.19k
        if (!pch || len == 0) {
98
0
            err_ = JsonbErrType::E_EMPTY_DOCUMENT;
99
0
            return false;
100
0
        }
101
102
1.19k
        JsonbInBuffer sb(pch, len);
103
1.19k
        std::istream in(&sb);
104
1.19k
        return parse(in, handler);
105
1.19k
    }
106
107
    // parse UTF-8 JSON text from an input stream
108
1.19k
    bool parse(std::istream& in, hDictInsert handler = nullptr) {
109
1.19k
        bool res = false;
110
1.19k
        err_ = JsonbErrType::E_NONE;
111
1.19k
        stream_pos_ = 0;
112
113
        // reset output stream
114
1.19k
        writer_.reset();
115
116
1.19k
        trim(in);
117
118
        // TODO(wzy): parsePrimitive should be implemented
119
1.19k
        if (in.peek() == '{') {
120
140
            skipChar(in);
121
140
            res = parseObject(in, handler);
122
1.05k
        } else if (in.peek() == '[') {
123
426
            skipChar(in);
124
426
            res = parseArray(in, handler);
125
627
        } else {
126
627
            res = parsePrimitive(in, handler);
127
627
            if (!res) err_ = handle_parse_failure(in);
128
627
        }
129
130
1.19k
        trim(in);
131
1.19k
        if (res && !in.eof()) {
132
0
            err_ = JsonbErrType::E_INVALID_DOCU;
133
0
            return false;
134
0
        }
135
136
1.19k
        return res;
137
1.19k
    }
138
139
2.25k
    // Returns the writer this parser serializes into.
    JsonbWriterT<OS_TYPE>& getWriter() {
        return writer_;
    }
140
141
68
    // Returns the current error code (E_NONE after reset/clearErr).
    JsonbErrType getErrorCode() {
        return err_;
    }
142
143
    // Bundles the current error position and the message for the current error
    // code into a JsonbErrInfo.
    JsonbErrInfo getErrorInfo() {
        assert(err_ < JsonbErrType::E_NUM_ERRORS);

        JsonbErrInfo info;
        info.err_msg = JsonbErrMsg::getErrMsg(err_);
        // stream_pos_ always points to the next char, so err_pos is 1-based
        info.err_pos = stream_pos_;
        return info;
    }
154
155
    // clear error code
156
    // Resets the error code to E_NONE.
    void clearErr() {
        err_ = JsonbErrType::E_NONE;
    }
157
158
private:
159
0
    // Classifies the outcome of re-parsing a top-level literal.
    //   parse_res - whether the literal itself parsed successfully
    //   in        - the input stream, positioned right after the literal
    // Returns E_INVALID_DOCU_COMPAT when the literal parsed and, after trim(),
    // the stream is no longer good(); E_INVALID_DOCU in every other case.
    JsonbErrType handle_parse_value_failure(bool parse_res, std::istream& in) {
        if (!parse_res) {
            return JsonbErrType::E_INVALID_DOCU;
        }

        trim(in);
        return in.good() ? JsonbErrType::E_INVALID_DOCU : JsonbErrType::E_INVALID_DOCU_COMPAT;
    }
169
170
    // In case json is determined to be invalid at top level,
171
    // try to parse literal values.
172
    // We return a different error code E_INVALID_DOCU_COMPAT
173
    // in case the input json contains these values.
174
    // Returning a different error code will cause an
175
    // auditing on the caller.
176
    // This is mainly done because 8.0 JSON_VALID considers
177
    // this as a valid input.
178
46
    // Second-chance handling after the top-level primitive parse failed.
    // Opens an array on the writer, tries to parse a bare literal (null, true,
    // false, string or number) from `in`, then closes the array.
    // Returns E_INVALID_DOCU_COMPAT when a literal was parsed and nothing but
    // trimmed characters follows it, E_INVALID_DOCU otherwise.
    JsonbErrType handle_parse_failure(std::istream& in) {
        if (!writer_.writeStartArray()) {
            return JsonbErrType::E_INVALID_DOCU;
        }

        JsonbErrType verdict = JsonbErrType::E_INVALID_DOCU;
        const auto lead = in.peek();
        if (lead == 'n') {
            skipChar(in);
            verdict = handle_parse_value_failure(parseNull(in), in);
        } else if (lead == 't') {
            skipChar(in);
            verdict = handle_parse_value_failure(parseTrue(in), in);
        } else if (lead == 'f') {
            skipChar(in);
            verdict = handle_parse_value_failure(parseFalse(in), in);
        } else if (lead == '"') {
            skipChar(in);
            verdict = handle_parse_value_failure(parseString(in), in);
        } else if (parseNumber(in)) {
            trim(in);
            if (in.eof()) {
                verdict = JsonbErrType::E_INVALID_DOCU_COMPAT;
            }
        }

        // The array is always closed; whether closing succeeds does not change
        // the code reported to the caller.
        static_cast<void>(writer_.writeEndArray());
        return verdict;
    }
215
216
    // parse primitive
217
627
    bool parsePrimitive(std::istream& in, hDictInsert handler) {
218
627
        bool res = false;
219
627
        switch (in.peek()) {
220
63
        case 'n':
221
63
            skipChar(in);
222
63
            res = parseNull(in);
223
63
            break;
224
63
        case 't':
225
63
            skipChar(in);
226
63
            res = parseTrue(in);
227
63
            break;
228
63
        case 'f':
229
63
            skipChar(in);
230
63
            res = parseFalse(in);
231
63
            break;
232
71
        case '"':
233
71
            skipChar(in);
234
71
            res = parseString(in);
235
71
            break;
236
367
        default:
237
367
            res = parseNumber(in);
238
627
        }
239
240
627
        return res;
241
627
    }
242
243
    // parse a JSON object (comma-separated list of key-value pairs)
244
236
    bool parseObject(std::istream& in, hDictInsert handler) {
245
236
        if (!writer_.writeStartObject()) {
246
0
            err_ = JsonbErrType::E_OUTPUT_FAIL;
247
0
            return false;
248
0
        }
249
250
236
        trim(in);
251
252
236
        if (in.peek() == '}') {
253
66
            skipChar(in);
254
            // empty object
255
66
            if (!writer_.writeEndObject()) {
256
0
                err_ = JsonbErrType::E_OUTPUT_FAIL;
257
0
                return false;
258
0
            }
259
66
            return true;
260
66
        }
261
262
329
        while (in.good()) {
263
329
            if (nextChar(in) != '"') {
264
11
                err_ = JsonbErrType::E_INVALID_OBJ;
265
11
                return false;
266
11
            }
267
268
318
            if (!parseKVPair(in, handler)) {
269
0
                return false;
270
0
            }
271
272
318
            trim(in);
273
274
318
            char ch = nextChar(in);
275
318
            if (ch == '}') {
276
                // end of the object
277
159
                if (!writer_.writeEndObject()) {
278
0
                    err_ = JsonbErrType::E_OUTPUT_FAIL;
279
0
                    return false;
280
0
                }
281
159
                return true;
282
159
            } else if (ch != ',') {
283
0
                err_ = JsonbErrType::E_INVALID_OBJ;
284
0
                return false;
285
0
            }
286
287
159
            trim(in);
288
159
        }
289
290
0
        err_ = JsonbErrType::E_INVALID_OBJ;
291
0
        return false;
292
170
    }
293
294
    // parse a JSON array (comma-separated list of values)
295
426
    bool parseArray(std::istream& in, hDictInsert handler) {
296
426
        if (!writer_.writeStartArray()) {
297
0
            err_ = JsonbErrType::E_OUTPUT_FAIL;
298
0
            return false;
299
0
        }
300
301
426
        trim(in);
302
303
426
        if (in.peek() == ']') {
304
63
            skipChar(in);
305
            // empty array
306
63
            if (!writer_.writeEndArray()) {
307
0
                err_ = JsonbErrType::E_OUTPUT_FAIL;
308
0
                return false;
309
0
            }
310
63
            return true;
311
63
        }
312
313
1.35k
        while (in.good()) {
314
1.35k
            if (!parseValue(in, handler)) {
315
11
                return false;
316
11
            }
317
318
1.34k
            trim(in);
319
320
1.34k
            char ch = nextChar(in);
321
1.34k
            if (ch == ']') {
322
                // end of the array
323
352
                if (!writer_.writeEndArray()) {
324
0
                    err_ = JsonbErrType::E_OUTPUT_FAIL;
325
0
                    return false;
326
0
                }
327
352
                return true;
328
991
            } else if (ch != ',') {
329
0
                err_ = JsonbErrType::E_INVALID_ARR;
330
0
                return false;
331
0
            }
332
333
991
            trim(in);
334
991
        }
335
336
0
        err_ = JsonbErrType::E_INVALID_ARR;
337
0
        return false;
338
363
    }
339
340
    // parse a key-value pair, separated by ":"
341
318
    bool parseKVPair(std::istream& in, hDictInsert handler) {
342
318
        if (parseKey(in, handler) && parseValue(in, handler)) {
343
318
            return true;
344
318
        }
345
346
0
        return false;
347
318
    }
348
349
    // parse a key (must be string)
350
318
    bool parseKey(std::istream& in, hDictInsert handler) {
351
318
        char key[JsonbKeyValue::sMaxKeyLen];
352
318
        int key_len = 0;
353
954
        while (in.good() && in.peek() != '"' && key_len < JsonbKeyValue::sMaxKeyLen) {
354
636
            char ch = nextChar(in);
355
636
            if (ch == '\\') {
356
0
                char escape_buffer[5]; // buffer for escape
357
0
                int len;
358
0
                if (!parseEscape(in, escape_buffer, len)) {
359
0
                    err_ = JsonbErrType::E_INVALID_KEY_STRING;
360
0
                    return false;
361
0
                }
362
0
                if (key_len + len >= JsonbKeyValue::sMaxKeyLen) {
363
0
                    err_ = JsonbErrType::E_INVALID_KEY_LENGTH;
364
0
                    return false;
365
0
                }
366
0
                memcpy(key + key_len, escape_buffer, len);
367
0
                key_len += len;
368
636
            } else {
369
636
                key[key_len++] = ch;
370
636
            }
371
636
        }
372
        // The JSON key can be an empty string.
373
318
        if (!in.good() || in.peek() != '"') {
374
0
            if (key_len == JsonbKeyValue::sMaxKeyLen)
375
0
                err_ = JsonbErrType::E_INVALID_KEY_LENGTH;
376
0
            else
377
0
                err_ = JsonbErrType::E_INVALID_KEY_STRING;
378
0
            return false;
379
0
        }
380
381
318
        skipChar(in); // discard '"'
382
383
318
        int key_id = -1;
384
318
        if (handler) {
385
0
            key_id = handler(key, key_len);
386
0
        }
387
388
318
        if (key_id < 0) {
389
318
            writer_.writeKey(key, key_len);
390
318
        } else {
391
0
            writer_.writeKey(key_id);
392
0
        }
393
394
318
        trim(in);
395
396
318
        if (nextChar(in) != ':') {
397
0
            err_ = JsonbErrType::E_INVALID_OBJ;
398
0
            return false;
399
0
        }
400
401
318
        trim(in);
402
318
        if (!in.good()) {
403
0
            err_ = JsonbErrType::E_INVALID_OBJ;
404
0
            return false;
405
0
        }
406
407
318
        return true;
408
318
    }
409
410
    // parse a value
411
1.67k
    bool parseValue(std::istream& in, hDictInsert handler) {
412
1.67k
        bool res = false;
413
414
1.67k
        switch (in.peek()) {
415
0
        case 'N':
416
109
        case 'n': {
417
109
            skipChar(in);
418
109
            res = parseNull(in);
419
109
            break;
420
0
        }
421
0
        case 'T':
422
109
        case 't': {
423
109
            skipChar(in);
424
109
            res = parseTrue(in);
425
109
            break;
426
0
        }
427
0
        case 'F':
428
109
        case 'f': {
429
109
            skipChar(in);
430
109
            res = parseFalse(in);
431
109
            break;
432
0
        }
433
511
        case '"': {
434
511
            skipChar(in);
435
511
            res = parseString(in);
436
511
            break;
437
0
        }
438
96
        case '{': {
439
96
            skipChar(in);
440
96
            ++nesting_lvl_;
441
96
            if (nesting_lvl_ >= MaxNestingLevel) {
442
0
                err_ = JsonbErrType::E_NESTING_LVL_OVERFLOW;
443
0
                return false;
444
0
            }
445
96
            res = parseObject(in, handler);
446
96
            if (res) {
447
96
                --nesting_lvl_;
448
96
            }
449
96
            break;
450
96
        }
451
0
        case '[': {
452
0
            skipChar(in);
453
0
            ++nesting_lvl_;
454
0
            if (nesting_lvl_ >= MaxNestingLevel) {
455
0
                err_ = JsonbErrType::E_NESTING_LVL_OVERFLOW;
456
0
                return false;
457
0
            }
458
0
            res = parseArray(in, handler);
459
0
            if (res) {
460
0
                --nesting_lvl_;
461
0
            }
462
0
            break;
463
0
        }
464
738
        default: {
465
738
            res = parseNumber(in);
466
738
            break;
467
0
        }
468
1.67k
        }
469
470
1.67k
        return res;
471
1.67k
    }
472
473
    // parse NULL value
474
172
    bool parseNull(std::istream& in) {
475
172
        if (tolower(nextChar(in)) == 'u' && tolower(nextChar(in)) == 'l' &&
476
172
            tolower(nextChar(in)) == 'l') {
477
172
            writer_.writeNull();
478
172
            return true;
479
172
        }
480
481
0
        err_ = JsonbErrType::E_INVALID_SCALAR_VALUE;
482
0
        return false;
483
172
    }
484
485
    // parse TRUE value
486
172
    bool parseTrue(std::istream& in) {
487
172
        if (tolower(nextChar(in)) == 'r' && tolower(nextChar(in)) == 'u' &&
488
172
            tolower(nextChar(in)) == 'e') {
489
172
            writer_.writeBool(true);
490
172
            return true;
491
172
        }
492
493
0
        err_ = JsonbErrType::E_INVALID_SCALAR_VALUE;
494
0
        return false;
495
172
    }
496
497
    // parse FALSE value
498
172
    bool parseFalse(std::istream& in) {
499
172
        if (tolower(nextChar(in)) == 'a' && tolower(nextChar(in)) == 'l' &&
500
172
            tolower(nextChar(in)) == 's' && tolower(nextChar(in)) == 'e') {
501
172
            writer_.writeBool(false);
502
172
            return true;
503
172
        }
504
505
0
        err_ = JsonbErrType::E_INVALID_SCALAR_VALUE;
506
0
        return false;
507
172
    }
508
509
    /*
510
    This is a helper function to parse the hex value. hex_num means the
511
    maximum number of digits to parse. Parsing also stops at end of input or
512
    at any character in kJsonDelim; an unrecognized hex digit makes it return 0.
513
  */
514
0
    unsigned parseHexHelper(std::istream& in, uint64_t& val, unsigned hex_num = 17) {
515
        // We can't read more than 17 digits, so when read 17 digits, it's overflow
516
0
        val = 0;
517
0
        unsigned num_digits = 0;
518
0
        char ch = tolower(in.peek());
519
0
        while (in.good() && !strchr(kJsonDelim, ch) && num_digits != hex_num) {
520
0
            if (ch >= '0' && ch <= '9') {
521
0
                val = (val << 4) + (ch - '0');
522
0
            } else if (ch >= 'a' && ch <= 'f') {
523
0
                val = (val << 4) + (ch - 'a' + 10);
524
0
            } else {
525
                // unrecognized hex digit
526
0
                return 0;
527
0
            }
528
0
            skipChar(in);
529
0
            ch = tolower(in.peek());
530
0
            ++num_digits;
531
0
        }
532
0
        return num_digits;
533
0
    }
534
535
    // parse HEX value
536
0
    bool parseHex4(std::istream& in, unsigned& h) {
537
0
        uint64_t val;
538
0
        if (4 == parseHexHelper(in, val, 4)) {
539
0
            h = (unsigned)val;
540
0
            return true;
541
0
        }
542
0
        return false;
543
0
    }
544
545
    /*
546
     parse Escape char.
547
  */
548
0
    bool parseEscape(std::istream& in, char* out, int& len) {
549
        /*
550
      This is extracted from cJSON implementation.
551
      This is about the mask of the first byte in UTF-8.
552
      The mask is defined in:
553
      http://en.wikipedia.org/wiki/UTF-8#Description
554
    */
555
0
        const unsigned char firstByteMark[6] = {0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};
556
0
        if (!in.good()) {
557
0
            return false;
558
0
        }
559
0
        char c = nextChar(in);
560
0
        len = 1;
561
0
        switch (c) {
562
        // \" \\ \/  \b \f \n \r \t
563
0
        case '"':
564
0
            *out = '"';
565
0
            return true;
566
0
        case '\\':
567
0
            *out = '\\';
568
0
            return true;
569
0
        case '/':
570
0
            *out = '/';
571
0
            return true;
572
0
        case 'b':
573
0
            *out = '\b';
574
0
            return true;
575
0
        case 'f':
576
0
            *out = '\f';
577
0
            return true;
578
0
        case 'n':
579
0
            *out = '\n';
580
0
            return true;
581
0
        case 'r':
582
0
            *out = '\r';
583
0
            return true;
584
0
        case 't':
585
0
            *out = '\t';
586
0
            return true;
587
0
        case 'u': {
588
0
            unsigned uc;
589
0
            if (!parseHex4(in, uc)) {
590
0
                return false;
591
0
            }
592
            /*
593
          For DC00 to DFFF, it should be low surrogates for UTF16.
594
          So if it display in the high bits, it's invalid.
595
        */
596
0
            if (uc >= 0xDC00 && uc <= 0xDFFF) {
597
0
                return false;
598
0
            }
599
600
            /*
601
          For D800 to DBFF, it's the high surrogates for UTF16.
602
          So it's utf-16, there must be another one between 0xDC00
603
          and 0xDFFF.
604
        */
605
0
            if (uc >= 0xD800 && uc <= 0xDBFF) {
606
0
                unsigned uc2;
607
608
0
                if (!in.good()) {
609
0
                    return false;
610
0
                }
611
0
                c = nextChar(in);
612
0
                if (c != '\\') {
613
0
                    return false;
614
0
                }
615
616
0
                if (!in.good()) {
617
0
                    return false;
618
0
                }
619
0
                c = nextChar(in);
620
0
                if (c != 'u') {
621
0
                    return false;
622
0
                }
623
624
0
                if (!parseHex4(in, uc2)) {
625
0
                    return false;
626
0
                }
627
                /*
628
            Now we need the low surrogates for UTF16. It should be
629
            within 0xDC00 and 0xDFFF.
630
          */
631
0
                if (uc2 < 0xDC00 || uc2 > 0xDFFF) return false;
632
                /*
633
            For the character that not in the Basic Multilingual Plan,
634
            it's represented as twelve-character, encoding the UTF-16
635
            surrogate pair.
636
            UTF16 is between 0x10000 and 0x10FFFF. The high surrogate
637
            present the high bits and the low surrogate present the
638
            lower 10 bits.
639
            For detailed explanation, please refer to:
640
            http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-404.pdf
641
            Then it will be converted to UTF8.
642
          */
643
0
                uc = 0x10000 + (((uc & 0x3FF) << 10) | (uc2 & 0x3FF));
644
0
            }
645
646
            /*
647
          Get the length of the unicode.
648
          Please refer to http://en.wikipedia.org/wiki/UTF-8#Description.
649
        */
650
0
            if (uc < 0x80)
651
0
                len = 1;
652
0
            else if (uc < 0x800)
653
0
                len = 2;
654
0
            else if (uc < 0x10000)
655
0
                len = 3;
656
0
            else
657
0
                len = 4;
658
0
            out += len;
659
            /*
660
          Encode it.
661
          Please refer to http://en.wikipedia.org/wiki/UTF-8#Description.
662
          This part of code has a reference to cJSON.
663
        */
664
0
            switch (len) {
665
0
            case 4:
666
0
                *--out = ((uc | 0x80) & 0xBF);
667
0
                uc >>= 6;
668
0
                [[fallthrough]];
669
0
            case 3:
670
0
                *--out = ((uc | 0x80) & 0xBF);
671
0
                uc >>= 6;
672
0
                [[fallthrough]];
673
0
            case 2:
674
0
                *--out = ((uc | 0x80) & 0xBF);
675
0
                uc >>= 6;
676
0
                [[fallthrough]];
677
0
            case 1:
678
                // Mask the first byte according to the standard.
679
0
                *--out = (uc | firstByteMark[len - 1]);
680
0
            }
681
0
            return true;
682
0
            break;
683
0
        }
684
0
        default:
685
0
            return false;
686
0
            break;
687
0
        }
688
0
    }
689
690
    // parse a string
691
582
    bool parseString(std::istream& in) {
692
582
        const int BUFFER_LEN = 4096;
693
582
        if (!writer_.writeStartString()) {
694
0
            err_ = JsonbErrType::E_OUTPUT_FAIL;
695
0
            return false;
696
0
        }
697
698
        // write 4KB at a time
699
582
        char buffer[BUFFER_LEN];
700
582
        int nread = 0;
701
2.20k
        while (in.good()) {
702
2.20k
            char ch = nextChar(in);
703
2.20k
            if (ch == '"') {
704
                // write all remaining bytes in the buffer
705
582
                if (nread > 0) {
706
582
                    if (!writer_.writeString(buffer, nread)) {
707
0
                        err_ = JsonbErrType::E_OUTPUT_FAIL;
708
0
                        return false;
709
0
                    }
710
582
                }
711
                // end writing string
712
582
                if (!writer_.writeEndString()) {
713
0
                    err_ = JsonbErrType::E_OUTPUT_FAIL;
714
0
                    return false;
715
0
                }
716
582
                return true;
717
1.61k
            } else if (ch == '\\') {
718
                // this is a escape char
719
0
                char escape_buffer[5]; // buffer for escape
720
0
                int len;
721
0
                if (!parseEscape(in, escape_buffer, len)) {
722
0
                    err_ = JsonbErrType::E_INVALID_STR;
723
0
                    return false;
724
0
                }
725
726
                // Write each char to the buffer
727
0
                for (int i = 0; i != len; ++i) {
728
0
                    buffer[nread++] = escape_buffer[i];
729
0
                    if (nread == BUFFER_LEN) {
730
0
                        if (!writer_.writeString(buffer, nread)) {
731
0
                            err_ = JsonbErrType::E_OUTPUT_FAIL;
732
0
                            return false;
733
0
                        }
734
0
                        nread = 0;
735
0
                    }
736
0
                }
737
1.61k
            } else {
738
                // just a char
739
1.61k
                buffer[nread++] = ch;
740
1.61k
                if (nread == BUFFER_LEN) {
741
                    // flush buffer
742
0
                    if (!writer_.writeString(buffer, nread)) {
743
0
                        err_ = JsonbErrType::E_OUTPUT_FAIL;
744
0
                        return false;
745
0
                    }
746
0
                    nread = 0;
747
0
                }
748
1.61k
            }
749
2.20k
        }
750
751
0
        err_ = JsonbErrType::E_INVALID_STR;
752
0
        return false;
753
582
    }
754
755
    // parse a number
756
    // Number format can be hex, octal, or decimal (including float).
757
    // Only decimal can have (+/-) sign prefix.
758
1.15k
    bool parseNumber(std::istream& in) {
759
1.15k
        bool ret = false;
760
1.15k
        switch (in.peek()) {
761
0
        case '0': {
762
0
            skipChar(in);
763
764
0
            if (in.peek() == 'x' || in.peek() == 'X') {
765
0
                skipChar(in);
766
0
                ret = parseHex(in);
767
0
            } else if (in.peek() == '.') {
768
0
                skipChar(in); // remove '.'
769
0
                num_buf_[0] = '.';
770
0
                ret = parseDouble(in, num_buf_ + 1);
771
0
            } else {
772
0
                ret = parseOctal(in);
773
0
            }
774
775
0
            break;
776
0
        }
777
0
        case '-': {
778
0
            skipChar(in);
779
0
            ret = parseDecimal(in, true);
780
0
            break;
781
0
        }
782
0
        case '+':
783
0
            skipChar(in);
784
        // fall through
785
1.15k
        default:
786
1.15k
            ret = parseDecimal(in);
787
1.15k
            break;
788
1.15k
        }
789
790
1.15k
        return ret;
791
1.15k
    }
792
793
    // parse a number in hex format
794
0
    bool parseHex(std::istream& in) {
795
0
        uint64_t val = 0;
796
0
        int num_digits;
797
0
        if (0 == (num_digits = parseHexHelper(in, val))) {
798
0
            err_ = JsonbErrType::E_INVALID_HEX;
799
0
            return false;
800
0
        }
801
802
0
        int size = 0;
803
0
        if (num_digits <= 2) {
804
0
            size = writer_.writeInt8((int8_t)val);
805
0
        } else if (num_digits <= 4) {
806
0
            size = writer_.writeInt16((int16_t)val);
807
0
        } else if (num_digits <= 8) {
808
0
            size = writer_.writeInt32((int32_t)val);
809
0
        } else if (num_digits <= 16) {
810
0
            size = writer_.writeInt64(val);
811
0
        } else {
812
0
            err_ = JsonbErrType::E_HEX_OVERFLOW;
813
0
            return false;
814
0
        }
815
816
0
        if (size == 0) {
817
0
            err_ = JsonbErrType::E_OUTPUT_FAIL;
818
0
            return false;
819
0
        }
820
821
0
        return true;
822
0
    }
823
824
    // parse a number in octal format
825
0
    bool parseOctal(std::istream& in) {
826
0
        int64_t val = 0;
827
0
        char ch = in.peek();
828
0
        while (in.good() && !strchr(kJsonDelim, ch)) {
829
0
            if (ch >= '0' && ch <= '7') {
830
0
                val = val * 8 + (ch - '0');
831
0
            } else {
832
0
                err_ = JsonbErrType::E_INVALID_OCTAL;
833
0
                return false;
834
0
            }
835
836
            // check if the number overflows
837
0
            if (val < 0) {
838
0
                err_ = JsonbErrType::E_OCTAL_OVERFLOW;
839
0
                return false;
840
0
            }
841
842
0
            skipChar(in);
843
0
            ch = in.peek();
844
0
        }
845
846
0
        int size = 0;
847
0
        if (val <= std::numeric_limits<int8_t>::max()) {
848
0
            size = writer_.writeInt8((int8_t)val);
849
0
        } else if (val <= std::numeric_limits<int16_t>::max()) {
850
0
            size = writer_.writeInt16((int16_t)val);
851
0
        } else if (val <= std::numeric_limits<int32_t>::max()) {
852
0
            size = writer_.writeInt32((int32_t)val);
853
0
        } else { // val <= INT64_MAX
854
0
            size = writer_.writeInt64(val);
855
0
        }
856
857
0
        if (size == 0) {
858
0
            err_ = JsonbErrType::E_OUTPUT_FAIL;
859
0
            return false;
860
0
        }
861
862
0
        return true;
863
0
    }
864
865
    // parse a number in decimal (including float)
866
1.15k
    bool parseDecimal(std::istream& in, bool neg = false) {
867
1.15k
        char ch = 0;
868
1.15k
        while (in.good() && (ch = in.peek()) == '0') skipChar(in);
869
870
1.15k
        char* pbuf = num_buf_;
871
1.15k
        if (neg) *(pbuf++) = '-';
872
873
1.15k
        char* save_pos = pbuf;
874
5.18k
        while (in.good() && !strchr(kJsonDelim, ch)) {
875
4.40k
            *(pbuf++) = ch;
876
4.40k
            if (pbuf == end_buf_) {
877
0
                err_ = JsonbErrType::E_DECIMAL_OVERFLOW;
878
0
                return false;
879
0
            }
880
881
4.40k
            if (ch == '.') {
882
282
                skipChar(in); // remove '.'
883
282
                return parseDouble(in, pbuf);
884
4.12k
            } else if (ch == 'E' || ch == 'e') {
885
0
                skipChar(in); // remove 'E'
886
0
                return parseExponent(in, pbuf);
887
4.12k
            } else if (ch < '0' || ch > '9') {
888
92
                err_ = JsonbErrType::E_INVALID_DECIMAL;
889
92
                return false;
890
92
            }
891
892
4.02k
            skipChar(in);
893
4.02k
            ch = in.peek();
894
4.02k
        }
895
777
        if (save_pos == pbuf) {
896
0
            err_ = JsonbErrType::E_INVALID_DECIMAL; // empty input
897
0
            return false;
898
0
        }
899
900
777
        *pbuf = 0; // set null-terminator
901
777
        StringParser::ParseResult parse_result = StringParser::PARSE_SUCCESS;
902
777
        int128_t val =
903
777
                StringParser::string_to_int<int128_t>(num_buf_, pbuf - num_buf_, &parse_result);
904
777
        if (parse_result != StringParser::PARSE_SUCCESS) {
905
0
            VLOG_ROW << "debug string_to_int error for " << num_buf_ << " val=" << val
906
0
                     << " parse_result=" << parse_result;
907
0
            err_ = JsonbErrType::E_DECIMAL_OVERFLOW;
908
0
            return false;
909
0
        }
910
911
777
        int size = 0;
912
777
        if (val >= std::numeric_limits<int8_t>::min() &&
913
777
            val <= std::numeric_limits<int8_t>::max()) {
914
357
            size = writer_.writeInt8((int8_t)val);
915
420
        } else if (val >= std::numeric_limits<int16_t>::min() &&
916
420
                   val <= std::numeric_limits<int16_t>::max()) {
917
294
            size = writer_.writeInt16((int16_t)val);
918
294
        } else if (val >= std::numeric_limits<int32_t>::min() &&
919
126
                   val <= std::numeric_limits<int32_t>::max()) {
920
63
            size = writer_.writeInt32((int32_t)val);
921
63
        } else if (val >= std::numeric_limits<int64_t>::min() &&
922
63
                   val <= std::numeric_limits<int64_t>::max()) {
923
63
            size = writer_.writeInt64((int64_t)val);
924
63
        } else { // INT128
925
0
            size = writer_.writeInt128(val);
926
0
        }
927
928
777
        if (size == 0) {
929
0
            err_ = JsonbErrType::E_OUTPUT_FAIL;
930
0
            return false;
931
0
        }
932
933
777
        return true;
934
777
    }
935
936
    // parse IEEE 754 double precision (the digits after the decimal point)
937
282
    bool parseDouble(std::istream& in, char* pbuf) {
938
282
        char* save_pos = pbuf;
939
282
        char ch = in.peek();
940
824
        while (in.good() && !strchr(kJsonDelim, ch)) {
941
553
            *(pbuf++) = ch;
942
553
            if (pbuf == end_buf_) {
943
0
                err_ = JsonbErrType::E_DOUBLE_OVERFLOW;
944
0
                return false;
945
0
            }
946
947
553
            if (ch == 'e' || ch == 'E') {
948
0
                skipChar(in); // remove 'E'
949
0
                return parseExponent(in, pbuf);
950
553
            } else if (ch < '0' || ch > '9') {
951
11
                err_ = JsonbErrType::E_INVALID_DECIMAL;
952
11
                return false;
953
11
            }
954
955
542
            skipChar(in);
956
542
            ch = in.peek();
957
542
        }
958
271
        if (save_pos == pbuf) {
959
0
            err_ = JsonbErrType::E_INVALID_DECIMAL; // empty input
960
0
            return false;
961
0
        }
962
963
271
        *pbuf = 0; // set null-terminator
964
271
        return internConvertBufferToDouble(num_buf_, pbuf - num_buf_);
965
271
    }
966
967
    // parse the exponent part of a double number
968
0
    bool parseExponent(std::istream& in, char* pbuf) {
969
0
        char ch = in.peek();
970
0
        if (in.good()) {
971
0
            if (ch == '+' || ch == '-') {
972
0
                *(pbuf++) = ch;
973
0
                if (pbuf == end_buf_) {
974
0
                    err_ = JsonbErrType::E_DOUBLE_OVERFLOW;
975
0
                    return false;
976
0
                }
977
0
                skipChar(in);
978
0
                ch = in.peek();
979
0
            }
980
0
        }
981
982
0
        char* save_pos = pbuf;
983
0
        while (in.good() && !strchr(kJsonDelim, ch)) {
984
0
            *(pbuf++) = ch;
985
0
            if (pbuf == end_buf_) {
986
0
                err_ = JsonbErrType::E_DOUBLE_OVERFLOW;
987
0
                return false;
988
0
            }
989
990
0
            if (ch < '0' || ch > '9') {
991
0
                err_ = JsonbErrType::E_INVALID_EXPONENT;
992
0
                return false;
993
0
            }
994
995
0
            skipChar(in);
996
0
            ch = in.peek();
997
0
        }
998
0
        if (save_pos == pbuf) {
999
0
            err_ = JsonbErrType::E_INVALID_EXPONENT; // empty input
1000
0
            return false;
1001
0
        }
1002
1003
0
        *pbuf = 0; // set null-terminator
1004
0
        return internConvertBufferToDouble(num_buf_, pbuf - num_buf_);
1005
0
    }
1006
1007
    // convert the buffered number string to a double and write it out
1008
271
    bool internConvertBufferToDouble(char* num_buf_, int len) {
1009
271
        StringParser::ParseResult parse_result = StringParser::PARSE_SUCCESS;
1010
271
        double val = StringParser::string_to_float<double>(num_buf_, len, &parse_result);
1011
271
        if (parse_result != StringParser::PARSE_SUCCESS) {
1012
0
            VLOG_ROW << "debug string_to_float error for " << num_buf_ << " val=" << val
1013
0
                     << " parse_result=" << parse_result;
1014
0
            err_ = JsonbErrType::E_DECIMAL_OVERFLOW;
1015
0
            return false;
1016
0
        }
1017
1018
271
        if (writer_.writeDouble(val) == 0) {
1019
0
            err_ = JsonbErrType::E_OUTPUT_FAIL;
1020
0
            return false;
1021
0
        }
1022
1023
271
        return true;
1024
271
    }
1025
1026
6.49k
    void trim(std::istream& in) {
1027
7.79k
        while (in.good() && strchr(kWhiteSpace, in.peek())) {
1028
1.30k
            skipChar(in);
1029
1.30k
        }
1030
6.49k
    }
1031
1032
    /*
1033
   * Helper functions to keep track of characters read.
1034
   * Do not rely on std::istream's tellg() which may not be implemented.
1035
   */
1036
1037
6.86k
    char nextChar(std::istream& in) {
1038
6.86k
        ++stream_pos_;
1039
6.86k
        return in.get();
1040
6.86k
    }
1041
1042
8.36k
    void skipChar(std::istream& in) {
1043
8.36k
        ++stream_pos_;
1044
8.36k
        in.ignore();
1045
8.36k
    }
1046
1047
private:
1048
    // Writer that receives every parsed value (writeInt8 ... writeDouble).
    JsonbWriterT<OS_TYPE> writer_;
1049
    // Count of characters consumed so far; bumped by nextChar()/skipChar()
    // because std::istream::tellg() may not be implemented.
    uint32_t stream_pos_;
1050
    // Error recorded by the parse routines when they return false.
    JsonbErrType err_;
1051
    char num_buf_[512]; // scratch buffer holding the text of the number being parsed
1052
    // Points at the last byte of num_buf_; the number parsers stop before
    // writing there so the null terminator always fits.
    const char* end_buf_ = num_buf_ + sizeof(num_buf_) - 1;
1053
    // NOTE(review): not used in this part of the file; presumably the current
    // nesting depth of the value being parsed -- confirm against the callers.
    uint32_t nesting_lvl_ = 0;
1054
};
1055
1056
// Default parser type: JsonbParserT specialized with JsonbOutStream.
using JsonbParser = JsonbParserT<JsonbOutStream>;
1057
1058
} // namespace doris
1059
1060
#endif // JSONB_JSONBJSONPARSER_H