Coverage Report

Created: 2026-03-14 18:33

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/util/jsonb_parser_simd.h
Line
Count
Source
1
/*
2
 *  Copyright (c) 2014, Facebook, Inc.
3
 *  All rights reserved.
4
 *
5
 *  This source code is licensed under the BSD-style license found in the
6
 *  LICENSE file in the root directory of this source tree. An additional grant
7
 *  of patent rights can be found in the PATENTS file in the same directory.
8
 *
9
 */
10
11
/*
12
 * This file defines JsonbParserTSIMD (template) and JsonbParser.
13
 *
14
 * JsonbParserTSIMD is a template class which implements a JSON parser.
15
 * JsonbParserTSIMD parses JSON text and serializes it to JSONB binary format
16
 * by using JsonbWriterT object. By default, JsonbParserTSIMD creates a new
17
 * JsonbWriterT object with an output stream object.  However, you can also
18
 * pass in your JsonbWriterT or any stream object that implements some basic
19
 * interface of std::ostream (see JsonbStream.h).
20
 *
21
 * JsonbParser specializes JsonbParserTSIMD with JsonbOutStream type (see
22
 * JsonbStream.h). So unless you want to provide your own output stream
23
 * type, use JsonbParser object.
24
 *
25
 * ** Parsing JSON **
26
 * JsonbParserTSIMD parses JSON string, and directly serializes into JSONB
27
 * packed bytes. There are three ways to parse a JSON string: (1) using
28
 * c-string, (2) using string with len, (3) using std::istream object. You can
29
 * use custom streambuf to redirect output. JsonbOutBuffer is a streambuf used
30
 * internally if the input is raw character buffer.
31
 *
32
 * You can reuse a JsonbParserTSIMD object to parse/serialize multiple JSON
33
 * strings, and the previous JSONB will be overwritten.
34
 *
35
 * If parsing fails (returned false), the error code will be set to one of
36
 * JsonbErrType, and can be retrieved by calling getErrorCode().
37
 *
38
 * ** External dictionary **
39
 * During parsing a JSON string, you can pass a call-back function to map a key
40
 * string to an id, and store the dictionary id in JSONB to save space. The
41
 * purpose of using an external dictionary is more towards a collection of
42
 * documents (which has common keys) rather than a single document, so that
43
 * space saving will be significant.
44
 *
45
 * ** Endianness **
46
 * Note: JSONB serialization doesn't assume endianness of the server. However
47
 * you will need to ensure that the endianness at the reader side is the same
48
 * as that at the writer side (if they are on different machines). Otherwise,
49
 * proper conversion is needed when a number value is returned to the
50
 * caller/writer.
51
 *
52
 * @author Tian Xia <tianx@fb.com>
53
 * 
54
 * this file is copied from 
55
 * https://github.com/facebook/mysql-5.6/blob/fb-mysql-5.6.35/fbson/FbsonJsonParser.h
56
 * and modified by Doris
57
 */
58
59
#pragma once
60
#include <simdjson.h>
61
62
#include <cmath>
63
#include <limits>
64
65
#include "common/status.h"
66
#include "util/jsonb_document.h"
67
#include "util/jsonb_writer.h"
68
#include "util/string_parser.hpp"
69
70
namespace doris {
71
#include "common/compile_check_begin.h"
72
// 128-bit signed integer (the compiler-provided __int128 type). JsonbParser uses it to
// hold JSON integers that do not fit in 64 bits.
using int128_t = __int128;
73
struct JsonbParser {
74
    // According to https://github.com/simdjson/simdjson/pull/2139
75
    // For numbers larger than 64 bits, we can obtain the raw_json_token and parse it ourselves.
76
    // This allows handling numbers larger than 64 bits, such as int128.
77
    // For example, try to parse a 18446744073709551616, this number is just 1 greater than the maximum value of uint64_t, and simdjson will return a NUMBER_ERROR
78
    // If try to parse a 18446744073709551616231231, it is obviously a large integer, at this time simdjson will return a BIGINT_ERROR
79
339k
    static bool parse_number_success(simdjson::error_code error_code) {
80
339k
        return error_code == simdjson::error_code::SUCCESS ||
81
339k
               error_code == simdjson::error_code::NUMBER_ERROR ||
82
339k
               error_code == simdjson::error_code::BIGINT_ERROR;
83
339k
    }
84
85
    // parse a UTF-8 JSON string with length
86
    // will reset writer before parse
87
55.3k
    static Status parse(const char* pch, size_t len, JsonbWriter& writer) {
88
55.3k
        if (!pch || len == 0) {
89
21
            return Status::InvalidArgument("Empty JSON document");
90
21
        }
91
55.3k
        writer.reset();
92
55.3k
        try {
93
55.3k
            simdjson::ondemand::parser simdjson_parser;
94
55.3k
            simdjson::padded_string json_str {pch, len};
95
55.3k
            simdjson::ondemand::document doc = simdjson_parser.iterate(json_str);
96
97
            // simdjson process top level primitive types specially
98
            // so some repeated code here
99
55.3k
            switch (doc.type()) {
100
43.1k
            case simdjson::ondemand::json_type::object:
101
53.1k
            case simdjson::ondemand::json_type::array: {
102
53.1k
                RETURN_IF_ERROR(parse(doc.get_value(), writer));
103
52.8k
                break;
104
53.1k
            }
105
52.8k
            case simdjson::ondemand::json_type::null: {
106
122
                if (writer.writeNull() == 0) {
107
0
                    return Status::InvalidArgument("writeNull failed");
108
0
                }
109
122
                break;
110
122
            }
111
135
            case simdjson::ondemand::json_type::boolean: {
112
135
                if (writer.writeBool(doc.get_bool()) == 0) {
113
0
                    return Status::InvalidArgument("writeBool failed");
114
0
                }
115
135
                break;
116
135
            }
117
335
            case simdjson::ondemand::json_type::string: {
118
335
                RETURN_IF_ERROR(write_string(doc.get_string(), writer));
119
335
                break;
120
335
            }
121
545
            case simdjson::ondemand::json_type::number: {
122
545
                simdjson::ondemand::number num;
123
545
                simdjson::error_code res = doc.get_number().get(num);
124
545
                if (!parse_number_success(res)) {
125
0
                    return Status::InvalidArgument(fmt::format("simdjson get_number failed: {}",
126
0
                                                               simdjson::error_message(res)));
127
0
                }
128
                // simdjson get_number() returns a number object, which can be
129
545
                RETURN_IF_ERROR(
130
545
                        write_number(num, doc.get_number_type(), doc.raw_json_token(), writer));
131
487
                break;
132
545
            }
133
55.3k
            }
134
55.3k
        } catch (simdjson::simdjson_error& e) {
135
1.17k
            return Status::InvalidArgument(fmt::format("simdjson parse exception: {}", e.what()));
136
1.17k
        }
137
53.8k
        return Status::OK();
138
55.3k
    }
139
140
private:
141
    // parse json, recursively if necessary, by simdjson
142
    //  and serialize to binary format by writer
143
877k
    static Status parse(simdjson::ondemand::value value, JsonbWriter& writer) {
144
877k
        switch (value.type()) {
145
11.8k
        case simdjson::ondemand::json_type::null: {
146
11.8k
            if (writer.writeNull() == 0) {
147
0
                return Status::InvalidArgument("writeNull failed");
148
0
            }
149
11.8k
            break;
150
11.8k
        }
151
21.2k
        case simdjson::ondemand::json_type::boolean: {
152
21.2k
            if (writer.writeBool(value.get_bool()) == 0) {
153
0
                return Status::InvalidArgument("writeBool failed");
154
0
            }
155
21.2k
            break;
156
21.2k
        }
157
374k
        case simdjson::ondemand::json_type::string: {
158
374k
            RETURN_IF_ERROR(write_string(value.get_string(), writer));
159
374k
            break;
160
374k
        }
161
374k
        case simdjson::ondemand::json_type::number: {
162
339k
            simdjson::ondemand::number num;
163
339k
            auto res = value.get_number().get(num);
164
339k
            if (!parse_number_success(res)) {
165
0
                return Status::InvalidArgument(fmt::format("simdjson get_number failed: {}",
166
0
                                                           simdjson::error_message(res)));
167
0
            }
168
169
339k
            RETURN_IF_ERROR(
170
339k
                    write_number(num, value.get_number_type(), value.raw_json_token(), writer));
171
339k
            break;
172
339k
        }
173
339k
        case simdjson::ondemand::json_type::object: {
174
59.6k
            if (!writer.writeStartObject()) {
175
0
                return Status::InvalidArgument("writeStartObject failed");
176
0
            }
177
178
193k
            for (auto kv : value.get_object()) {
179
193k
                std::string_view key;
180
193k
                simdjson::error_code e = kv.unescaped_key().get(key);
181
193k
                if (e != simdjson::SUCCESS) {
182
41
                    return Status::InvalidArgument(fmt::format("simdjson get key failed: {}", e));
183
41
                }
184
185
                // write key
186
193k
                if (key.size() > std::numeric_limits<uint8_t>::max()) {
187
268
                    return Status::InvalidArgument("key size exceeds max limit: {} , {}",
188
268
                                                   key.size(), std::numeric_limits<uint8_t>::max());
189
268
                }
190
192k
                if (!writer.writeKey(key.data(), (uint8_t)key.size())) {
191
0
                    return Status::InvalidArgument("writeKey failed : {}", key);
192
0
                }
193
194
                // parse object value
195
192k
                RETURN_IF_ERROR(parse(kv.value(), writer));
196
192k
            }
197
198
59.2k
            if (!writer.writeEndObject()) {
199
0
                return Status::InvalidArgument("writeEndObject failed");
200
0
                break;
201
0
            }
202
203
59.2k
            break;
204
59.2k
        }
205
72.6k
        case simdjson::ondemand::json_type::array: {
206
72.6k
            if (!writer.writeStartArray()) {
207
0
                return Status::InvalidArgument("writeStartArray failed");
208
0
            }
209
210
633k
            for (auto elem : value.get_array()) {
211
                // parse array element
212
633k
                RETURN_IF_ERROR(parse(elem.value(), writer));
213
633k
            }
214
215
72.4k
            if (!writer.writeEndArray()) {
216
0
                return Status::InvalidArgument("writeEndArray failed");
217
0
            }
218
72.4k
            break;
219
72.4k
        }
220
72.4k
        default: {
221
0
            return Status::InvalidArgument("unknown value type: ");
222
72.4k
        }
223
224
877k
        } // end of switch
225
877k
        return Status::OK();
226
877k
    }
227
228
374k
    // Writes `str` as one string value by calling the writer's writeStartString(),
    // writeString() and writeEndString() in that order. writeString() is skipped for
    // an empty string. Any failing writer call is reported as InvalidArgument.
    static Status write_string(std::string_view str, JsonbWriter& writer) {
        if (!writer.writeStartString()) {
            return Status::InvalidArgument("writeStartString failed");
        }

        // Only a non-empty payload is handed to the writer.
        const bool has_payload = !str.empty();
        if (has_payload && writer.writeString(str.data(), str.size()) == 0) {
            return Status::InvalidArgument("writeString failed");
        }

        if (!writer.writeEndString()) {
            return Status::InvalidArgument("writeEndString failed");
        }
        return Status::OK();
    }
247
248
    // Serializes one JSON number through `writer`, choosing the jsonb type from
    // simdjson's classification of the number (`num_type`):
    //   1. floating_point_number: a binary64 number, written as jsonb double.
    //   2. signed_integer: a signed integer that fits in a 64-bit word.
    //   3. unsigned_integer: a positive integer larger or equal to 1<<63.
    //      Both integer kinds are written with the narrowest of
    //      int8/int16/int32/int64/int128 that holds the value.
    //   4. big_integer: an integer that does not fit in a 64-bit word. simdjson cannot
    //      handle it directly, so `raw_string` is parsed as int128 first and as double
    //      if that fails; if both fail an error is returned.
    //
    // `raw_string` is the raw JSON token of the number. Returns InvalidArgument when
    // the token cannot be parsed or when the writer reports a failed write.
    static Status write_number(simdjson::ondemand::number num,
                               simdjson::ondemand::number_type num_type,
                               std::string_view raw_string, JsonbWriter& writer) {
        switch (num_type) {
        case simdjson::ondemand::number_type::floating_point_number: {
            double dbl = num.get_double();
            // Per the original note: a double beyond the precision simdjson can
            // represent comes back as 0, so a zero result is re-parsed from the raw
            // text instead of being trusted.
            if (dbl == 0) {
                StringParser::ParseResult parse_result;
                dbl = StringParser::string_to_float<double>(raw_string.data(), raw_string.size(),
                                                            &parse_result);
                if (parse_result != StringParser::PARSE_SUCCESS) {
                    return Status::InvalidArgument("invalid number, raw string is: " +
                                                   std::string(raw_string));
                }
            }

            if (writer.writeDouble(dbl) == 0) {
                return Status::InvalidArgument("writeDouble failed");
            }
            break;
        }
        case simdjson::ondemand::number_type::signed_integer:
        case simdjson::ondemand::number_type::unsigned_integer: {
            // Widen to 128 bits so signed and unsigned 64-bit inputs share one path.
            const int128_t wide = num.is_int64() ? static_cast<int128_t>(num.get_int64())
                                                 : static_cast<int128_t>(num.get_uint64());
            const auto within = [wide](int128_t lowest, int128_t highest) {
                return lowest <= wide && wide <= highest;
            };

            // Pick the narrowest integer type that can hold the value.
            bool written = false;
            if (within(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max())) {
                written = writer.writeInt8(static_cast<int8_t>(wide));
            } else if (within(std::numeric_limits<int16_t>::min(),
                              std::numeric_limits<int16_t>::max())) {
                written = writer.writeInt16(static_cast<int16_t>(wide));
            } else if (within(std::numeric_limits<int32_t>::min(),
                              std::numeric_limits<int32_t>::max())) {
                written = writer.writeInt32(static_cast<int32_t>(wide));
            } else if (within(std::numeric_limits<int64_t>::min(),
                              std::numeric_limits<int64_t>::max())) {
                written = writer.writeInt64(static_cast<int64_t>(wide));
            } else { // INT128
                written = writer.writeInt128(wide);
            }

            if (!written) {
                return Status::InvalidArgument("writeInt failed");
            }
            break;
        }
        case simdjson::ondemand::number_type::big_integer: {
            StringParser::ParseResult parse_result;
            auto as_int128 = StringParser::string_to_int<int128_t>(
                    raw_string.data(), raw_string.size(), &parse_result);
            if (parse_result == StringParser::PARSE_SUCCESS) {
                // as int128_t
                if (!writer.writeInt128(as_int128)) {
                    return Status::InvalidArgument("writeInt128 failed");
                }
                break;
            }

            // The text exceeds the range of int128_t, so try to parse it as a double.
            // This may lose precision, but exchanging numbers as plain text between
            // different systems may inherently lose precision anyway.
            double as_double = StringParser::string_to_float<double>(
                    raw_string.data(), raw_string.size(), &parse_result);
            if (parse_result != StringParser::PARSE_SUCCESS) {
                // if both parse failed, return error
                return Status::InvalidArgument("invalid number, raw string is: " +
                                               std::string(raw_string));
            }
            if (!writer.writeDouble(as_double)) {
                return Status::InvalidArgument("writeDouble failed");
            }
            break;
        }
        }
        return Status::OK();
    }
336
};
337
#include "common/compile_check_end.h"
338
} // namespace doris