Coverage Report

Created: 2026-05-21 12:43

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/util/jsonb_parser_simd.h
Line
Count
Source
1
/*
2
 *  Copyright (c) 2014, Facebook, Inc.
3
 *  All rights reserved.
4
 *
5
 *  This source code is licensed under the BSD-style license found in the
6
 *  LICENSE file in the root directory of this source tree. An additional grant
7
 *  of patent rights can be found in the PATENTS file in the same directory.
8
 *
9
 */
10
11
/*
12
 * This file defines JsonbParserTSIMD (template) and JsonbParser.
13
 *
14
 * JsonbParserTSIMD is a template class which implements a JSON parser.
15
 * JsonbParserTSIMD parses JSON text, and serializes it to JSONB binary format
16
 * by using JsonbWriterT object. By default, JsonbParserTSIMD creates a new
17
 * JsonbWriterT object with an output stream object.  However, you can also
18
 * pass in your JsonbWriterT or any stream object that implements some basic
19
 * interface of std::ostream (see JsonbStream.h).
20
 *
21
 * JsonbParser specializes JsonbParserTSIMD with JsonbOutStream type (see
22
 * JsonbStream.h). So unless you want to provide your own output stream
23
 * type, use JsonbParser object.
24
 *
25
 * ** Parsing JSON **
26
 * JsonbParserTSIMD parses JSON string, and directly serializes into JSONB
27
 * packed bytes. There are three ways to parse a JSON string: (1) using
28
 * c-string, (2) using string with len, (3) using std::istream object. You can
29
 * use custom streambuf to redirect output. JsonbOutBuffer is a streambuf used
30
 * internally if the input is raw character buffer.
31
 *
32
 * You can reuse a JsonbParserTSIMD object to parse/serialize multiple JSON
33
 * strings, and the previous JSONB will be overwritten.
34
 *
35
 * If parsing fails (returned false), the error code will be set to one of
36
 * JsonbErrType, and can be retrieved by calling getErrorCode().
37
 *
38
 * ** External dictionary **
39
 * While parsing a JSON string, you can pass a callback function to map a key
40
 * string to an id, and store the dictionary id in JSONB to save space. The
41
 * purpose of using an external dictionary is more towards a collection of
42
 * documents (which has common keys) rather than a single document, so that
43
 * space saving will be significant.
44
 *
45
 * ** Endianness **
46
 * Note: JSONB serialization doesn't assume endianness of the server. However
47
 * you will need to ensure that the endianness at the reader side is the same
48
 * as that at the writer side (if they are on different machines). Otherwise,
49
 * proper conversion is needed when a number value is returned to the
50
 * caller/writer.
51
 *
52
 * @author Tian Xia <tianx@fb.com>
53
 * 
54
 * this file is copied from 
55
 * https://github.com/facebook/mysql-5.6/blob/fb-mysql-5.6.35/fbson/FbsonJsonParser.h
56
 * and modified by Doris
57
 */
58
59
#pragma once
60
#include <simdjson.h>
61
62
#include <cmath>
63
#include <limits>
64
65
#include "common/status.h"
66
#include "util/jsonb_document.h"
67
#include "util/jsonb_writer.h"
68
#include "util/string_parser.hpp"
69
70
namespace doris {
71
// 128-bit signed integer alias. JsonbParser uses it to carry integer values that may
// not fit in 64 bits before choosing the narrowest JSONB integer type to write.
using int128_t = __int128;
72
struct JsonbParser {
73
    // parse a UTF-8 JSON string with length
74
    // will reset writer before parse
75
2.69k
    static Status parse(const char* pch, size_t len, JsonbWriter& writer) {
76
2.69k
        if (!pch || len == 0) {
77
11
            return Status::InvalidArgument("Empty JSON document");
78
11
        }
79
2.68k
        writer.reset();
80
2.68k
        try {
81
2.68k
            simdjson::ondemand::parser simdjson_parser;
82
2.68k
            simdjson::padded_string json_str {pch, len};
83
2.68k
            simdjson::ondemand::document doc = simdjson_parser.iterate(json_str);
84
85
            // simdjson process top level primitive types specially
86
            // so some repeated code here
87
2.68k
            bool need_check_at_end = true;
88
2.68k
            switch (doc.type()) {
89
417
            case simdjson::ondemand::json_type::object:
90
1.97k
            case simdjson::ondemand::json_type::array: {
91
1.97k
                RETURN_IF_ERROR(parse(doc.get_value(), writer));
92
1.70k
                break;
93
1.97k
            }
94
1.70k
            case simdjson::ondemand::json_type::null: {
95
57
                bool is_null = false;
96
57
                simdjson::error_code res = doc.is_null().get(is_null);
97
57
                if (res != simdjson::SUCCESS || !is_null) {
98
6
                    return Status::InvalidArgument(fmt::format("simdjson get null failed: {}",
99
6
                                                               simdjson::error_message(res)));
100
6
                }
101
51
                if (writer.writeNull() == 0) {
102
0
                    return Status::InvalidArgument("writeNull failed");
103
0
                }
104
51
                break;
105
51
            }
106
72
            case simdjson::ondemand::json_type::boolean: {
107
72
                if (writer.writeBool(doc.get_bool()) == 0) {
108
0
                    return Status::InvalidArgument("writeBool failed");
109
0
                }
110
72
                break;
111
72
            }
112
300
            case simdjson::ondemand::json_type::string: {
113
300
                RETURN_IF_ERROR(write_string(doc.get_string(), writer));
114
300
                break;
115
300
            }
116
300
            case simdjson::ondemand::json_type::number: {
117
264
                simdjson::ondemand::number num;
118
264
                simdjson::error_code res = doc.get_number().get(num);
119
264
                if (res == simdjson::error_code::SUCCESS) {
120
248
                    RETURN_IF_ERROR(
121
248
                            write_number(num, doc.get_number_type(), doc.raw_json_token(), writer));
122
248
                    break;
123
248
                }
124
16
                if (res == simdjson::error_code::NUMBER_ERROR ||
125
16
                    res == simdjson::error_code::BIGINT_ERROR) {
126
16
                    RETURN_IF_ERROR(write_number_from_raw_json(pch, len, writer));
127
7
                    need_check_at_end = false;
128
7
                    break;
129
16
                }
130
0
                return Status::InvalidArgument(fmt::format("simdjson get_number failed: {}",
131
0
                                                           simdjson::error_message(res)));
132
16
            }
133
2.68k
            }
134
2.36k
            if (need_check_at_end && !doc.at_end()) {
135
0
                return Status::InvalidArgument("simdjson parse exception: trailing content");
136
0
            }
137
2.36k
        } catch (simdjson::simdjson_error& e) {
138
30
            return Status::InvalidArgument(fmt::format("simdjson parse exception: {}", e.what()));
139
30
        }
140
2.36k
        return Status::OK();
141
2.68k
    }
142
143
private:
144
    // parse json, recursively if necessary, by simdjson
145
    //  and serialize to binary format by writer
146
561k
    static Status parse(simdjson::ondemand::value value, JsonbWriter& writer) {
147
561k
        switch (value.type()) {
148
11.7k
        case simdjson::ondemand::json_type::null: {
149
11.7k
            if (writer.writeNull() == 0) {
150
0
                return Status::InvalidArgument("writeNull failed");
151
0
            }
152
11.7k
            break;
153
11.7k
        }
154
20.5k
        case simdjson::ondemand::json_type::boolean: {
155
20.5k
            if (writer.writeBool(value.get_bool()) == 0) {
156
0
                return Status::InvalidArgument("writeBool failed");
157
0
            }
158
20.5k
            break;
159
20.5k
        }
160
250k
        case simdjson::ondemand::json_type::string: {
161
250k
            RETURN_IF_ERROR(write_string(value.get_string(), writer));
162
250k
            break;
163
250k
        }
164
250k
        case simdjson::ondemand::json_type::number: {
165
224k
            simdjson::ondemand::number num;
166
224k
            auto res = value.get_number().get(num);
167
224k
            if (res == simdjson::error_code::SUCCESS) {
168
224k
                RETURN_IF_ERROR(
169
224k
                        write_number(num, value.get_number_type(), value.raw_json_token(), writer));
170
224k
                break;
171
224k
            }
172
4
            if (res == simdjson::error_code::NUMBER_ERROR ||
173
4
                res == simdjson::error_code::BIGINT_ERROR) {
174
4
                RETURN_IF_ERROR(write_number_from_raw_json(value.raw_json_token(), writer));
175
1
                break;
176
4
            }
177
0
            return Status::InvalidArgument(
178
0
                    fmt::format("simdjson get_number failed: {}", simdjson::error_message(res)));
179
4
        }
180
2.67k
        case simdjson::ondemand::json_type::object: {
181
2.67k
            if (!writer.writeStartObject()) {
182
0
                return Status::InvalidArgument("writeStartObject failed");
183
0
            }
184
185
6.20k
            for (auto kv : value.get_object()) {
186
6.20k
                std::string_view key;
187
6.20k
                simdjson::error_code e = kv.unescaped_key().get(key);
188
6.20k
                if (e != simdjson::SUCCESS) {
189
2
                    return Status::InvalidArgument(fmt::format("simdjson get key failed: {}", e));
190
2
                }
191
192
                // write key
193
6.19k
                if (key.size() > std::numeric_limits<uint8_t>::max()) {
194
268
                    return Status::InvalidArgument("key size exceeds max limit: {} , {}",
195
268
                                                   key.size(), std::numeric_limits<uint8_t>::max());
196
268
                }
197
5.93k
                if (!writer.writeKey(key.data(), (uint8_t)key.size())) {
198
0
                    return Status::InvalidArgument("writeKey failed : {}", key);
199
0
                }
200
201
                // parse object value
202
5.93k
                RETURN_IF_ERROR(parse(kv.value(), writer));
203
5.93k
            }
204
205
2.31k
            if (!writer.writeEndObject()) {
206
0
                return Status::InvalidArgument("writeEndObject failed");
207
0
                break;
208
0
            }
209
210
2.31k
            break;
211
2.31k
        }
212
51.8k
        case simdjson::ondemand::json_type::array: {
213
51.8k
            if (!writer.writeStartArray()) {
214
0
                return Status::InvalidArgument("writeStartArray failed");
215
0
            }
216
217
553k
            for (auto elem : value.get_array()) {
218
                // parse array element
219
553k
                RETURN_IF_ERROR(parse(elem.value(), writer));
220
553k
            }
221
222
51.6k
            if (!writer.writeEndArray()) {
223
0
                return Status::InvalidArgument("writeEndArray failed");
224
0
            }
225
51.6k
            break;
226
51.6k
        }
227
51.6k
        default: {
228
0
            return Status::InvalidArgument("unknown value type: ");
229
51.6k
        }
230
231
561k
        } // end of switch
232
560k
        return Status::OK();
233
561k
    }
234
235
250k
    // Writes `str` as one string value: writeStartString(), then the payload via
    // writeString() (skipped for an empty string), then writeEndString().
    // Returns Status::InvalidArgument naming the first writer call that reports failure.
    static Status write_string(std::string_view str, JsonbWriter& writer) {
        if (!writer.writeStartString()) {
            return Status::InvalidArgument("writeStartString failed");
        }

        // An empty string has no payload, so writeString() is not called for it.
        const bool has_payload = !str.empty();
        if (has_payload && writer.writeString(str.data(), str.size()) == 0) {
            return Status::InvalidArgument("writeString failed");
        }

        if (!writer.writeEndString()) {
            return Status::InvalidArgument("writeEndString failed");
        }
        return Status::OK();
    }
254
255
    // Writes a number that simdjson parsed successfully, dispatching on `num_type`:
    //   - floating_point_number: written as a double;
    //   - signed_integer / unsigned_integer: widened to int128_t and handed to
    //     write_int128(), which selects the integer width to write;
    //   - big_integer: delegated to write_number_from_raw_json() with the raw token,
    //     which tries int128 first and falls back to double.
    // `raw_string` is the raw JSON token of the number; it is used for re-parsing and
    // for error messages. Returns Status::InvalidArgument on an unparsable or
    // non-finite double, or when a writer call reports failure.
    static Status write_number(simdjson::ondemand::number num,
                               simdjson::ondemand::number_type num_type,
                               std::string_view raw_string, JsonbWriter& writer) {
        using NumberType = simdjson::ondemand::number_type;

        if (num_type == NumberType::floating_point_number) {
            double parsed = num.get_double();
            // Per the original author's note, simdjson can hand back 0 for a literal it
            // cannot represent precisely, so a zero result is re-parsed from the raw
            // token with StringParser instead of being trusted.
            if (parsed == 0) {
                StringParser::ParseResult reparse_result;
                parsed = StringParser::string_to_float<double>(raw_string.data(),
                                                               raw_string.size(), &reparse_result);
                if (reparse_result != StringParser::PARSE_SUCCESS) {
                    return Status::InvalidArgument("invalid number, raw string is: " +
                                                   std::string(raw_string));
                }
            }
            if (!std::isfinite(parsed)) {
                return Status::InvalidArgument("invalid number, raw string is: " +
                                               std::string(raw_string));
            }
            if (writer.writeDouble(parsed) == 0) {
                return Status::InvalidArgument("writeDouble failed");
            }
            return Status::OK();
        }

        if (num_type == NumberType::signed_integer || num_type == NumberType::unsigned_integer) {
            // Both 64-bit flavours fit in int128_t without loss.
            const int128_t widened = num.is_int64() ? static_cast<int128_t>(num.get_int64())
                                                    : static_cast<int128_t>(num.get_uint64());
            RETURN_IF_ERROR(write_int128(widened, writer));
            return Status::OK();
        }

        if (num_type == NumberType::big_integer) {
            RETURN_IF_ERROR(write_number_from_raw_json(raw_string, writer));
            return Status::OK();
        }

        return Status::OK();
    }
306
307
42
    // Returns true for the four characters that are trimmed around a raw JSON number:
    // space, horizontal tab, line feed and carriage return.
    static bool is_json_number_space(char c) {
        switch (c) {
        case ' ':
        case '\t':
        case '\n':
        case '\r':
            return true;
        default:
            return false;
        }
    }

    // Returns `raw_number` with every leading and trailing character accepted by
    // is_json_number_space() removed. Interior characters are left untouched; an
    // input made up only of such characters yields an empty view.
    static std::string_view trim_json_number(std::string_view raw_number) {
        // Count the leading run first, then drop it in one step.
        size_t leading = 0;
        while (leading < raw_number.size() && is_json_number_space(raw_number[leading])) {
            ++leading;
        }
        raw_number.remove_prefix(leading);

        // Same for the trailing run, measured on what is left.
        size_t trailing = 0;
        while (trailing < raw_number.size() &&
               is_json_number_space(raw_number[raw_number.size() - 1 - trailing])) {
            ++trailing;
        }
        raw_number.remove_suffix(trailing);
        return raw_number;
    }
320
321
1.08k
    // Returns true when `c` is one of the characters '0' through '9'.
    static bool is_json_number_digit(char c) {
        return !(c < '0' || c > '9');
    }
322
323
20
    // Checks that `raw_number` is, in its entirety, a number of the form
    //   [-] (0 | [1-9][0-9]*) [. [0-9]+] [(e|E) [+|-] [0-9]+]
    // On success sets `is_integer` to true when neither a fraction nor an exponent is
    // present and returns Status::OK(). On failure returns Status::InvalidArgument and
    // leaves `is_integer` unmodified.
    static Status validate_json_number(std::string_view raw_number, bool& is_integer) {
        if (raw_number.empty()) {
            return Status::InvalidArgument("empty number");
        }

        const size_t end = raw_number.size();
        size_t cursor = 0;

        // Builds the error shared by every malformed-number exit.
        const auto invalid_number = [&raw_number]() -> Status {
            return Status::InvalidArgument("invalid number, raw string is: " +
                                           std::string(raw_number));
        };
        // True when the cursor is in range and points at a digit.
        const auto at_digit = [&]() -> bool {
            return cursor < end && is_json_number_digit(raw_number[cursor]);
        };
        // Advances the cursor past a (possibly empty) run of digits.
        const auto skip_digits = [&]() {
            while (at_digit()) {
                ++cursor;
            }
        };

        // Optional sign; it must be followed by something.
        if (raw_number[cursor] == '-') {
            ++cursor;
            if (cursor == end) {
                return invalid_number();
            }
        }

        // Integer part: a lone '0', or a digit run starting with 1-9.
        const char lead = raw_number[cursor];
        if (lead == '0') {
            ++cursor;
        } else if (lead >= '1' && lead <= '9') {
            skip_digits();
        } else {
            return invalid_number();
        }

        // Optional fraction: '.' followed by at least one digit.
        bool saw_fraction = false;
        if (cursor < end && raw_number[cursor] == '.') {
            saw_fraction = true;
            ++cursor;
            if (!at_digit()) {
                return invalid_number();
            }
            skip_digits();
        }

        // Optional exponent: 'e'/'E', optional sign, at least one digit.
        bool saw_exponent = false;
        if (cursor < end && (raw_number[cursor] == 'e' || raw_number[cursor] == 'E')) {
            saw_exponent = true;
            ++cursor;
            if (cursor < end && (raw_number[cursor] == '+' || raw_number[cursor] == '-')) {
                ++cursor;
            }
            if (!at_digit()) {
                return invalid_number();
            }
            skip_digits();
        }

        // Anything left over after a well-formed number is trailing content.
        if (cursor != end) {
            return Status::InvalidArgument("simdjson parse exception: trailing content");
        }
        is_integer = !(saw_fraction || saw_exponent);
        return Status::OK();
    }
383
384
109k
    // Writes `val` using the first of int8, int16, int32, int64 whose range contains
    // it, falling back to writeInt128() when it fits none of them.
    // Returns Status::InvalidArgument("writeInt failed") when the writer reports failure.
    static Status write_int128(int128_t val, JsonbWriter& writer) {
        // Inclusive range test against the limits of a narrower integer type.
        const auto within = [val](int128_t low, int128_t high) {
            return val >= low && val <= high;
        };

        bool written = false;
        if (within(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max())) {
            written = writer.writeInt8(static_cast<int8_t>(val));
        } else if (within(std::numeric_limits<int16_t>::min(),
                          std::numeric_limits<int16_t>::max())) {
            written = writer.writeInt16(static_cast<int16_t>(val));
        } else if (within(std::numeric_limits<int32_t>::min(),
                          std::numeric_limits<int32_t>::max())) {
            written = writer.writeInt32(static_cast<int32_t>(val));
        } else if (within(std::numeric_limits<int64_t>::min(),
                          std::numeric_limits<int64_t>::max())) {
            written = writer.writeInt64(static_cast<int64_t>(val));
        } else {
            // Outside the int64 range: keep the full 128-bit value.
            written = writer.writeInt128(val);
        }

        if (written) {
            return Status::OK();
        }
        return Status::InvalidArgument("writeInt failed");
    }
407
408
16
    // Convenience overload: views the `len` bytes starting at `pch` as a
    // std::string_view and forwards to the string_view overload.
    static Status write_number_from_raw_json(const char* pch, size_t len, JsonbWriter& writer) {
        const std::string_view raw_number {pch, len};
        return write_number_from_raw_json(raw_number, writer);
    }
411
412
    // According to https://github.com/simdjson/simdjson/pull/2139
413
    // For numbers larger than 64 bits, we can obtain the raw_json_token and parse it ourselves.
414
    // This allows handling numbers larger than 64 bits, such as int128.
415
20
    static Status write_number_from_raw_json(std::string_view raw_number, JsonbWriter& writer) {
416
20
        raw_number = trim_json_number(raw_number);
417
20
        bool is_integer = false;
418
20
        RETURN_IF_ERROR(validate_json_number(raw_number, is_integer));
419
420
12
        StringParser::ParseResult result;
421
12
        if (is_integer) {
422
10
            auto val = StringParser::string_to_int<int128_t>(raw_number.data(), raw_number.size(),
423
10
                                                             &result);
424
10
            if (result == StringParser::PARSE_SUCCESS) {
425
6
                RETURN_IF_ERROR(write_int128(val, writer));
426
6
                return Status::OK();
427
6
            }
428
10
        }
429
430
        // If the string exceeds the range of int128_t, it will attempt to convert it to double.
431
        // This may result in loss of precision, but for JSON, exchanging data as plain text
432
        // between different systems may inherently cause precision loss.
433
6
        double double_val = StringParser::string_to_float<double>(raw_number.data(),
434
6
                                                                  raw_number.size(), &result);
435
6
        if (result != StringParser::PARSE_SUCCESS || !std::isfinite(double_val)) {
436
4
            return Status::InvalidArgument("invalid number, raw string is: " +
437
4
                                           std::string(raw_number));
438
4
        }
439
2
        if (!writer.writeDouble(double_val)) {
440
0
            return Status::InvalidArgument("writeDouble failed");
441
0
        }
442
2
        return Status::OK();
443
2
    }
444
};
445
} // namespace doris