Coverage Report

Created: 2026-04-11 13:34

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/util/jsonb_parser_simd.h
Line
Count
Source
1
/*
2
 *  Copyright (c) 2014, Facebook, Inc.
3
 *  All rights reserved.
4
 *
5
 *  This source code is licensed under the BSD-style license found in the
6
 *  LICENSE file in the root directory of this source tree. An additional grant
7
 *  of patent rights can be found in the PATENTS file in the same directory.
8
 *
9
 */
10
11
/*
12
 * This file defines JsonbParserTSIMD (template) and JsonbParser.
13
 *
14
 * JsonbParserTSIMD is a template class which implements a JSON parser.
15
 * JsonbParserTSIMD parses JSON text, and serializes it to JSONB binary format
16
 * by using JsonbWriterT object. By default, JsonbParserTSIMD creates a new
17
 * JsonbWriterT object with an output stream object.  However, you can also
18
 * pass in your JsonbWriterT or any stream object that implements some basic
19
 * interface of std::ostream (see JsonbStream.h).
20
 *
21
 * JsonbParser specializes JsonbParserTSIMD with JsonbOutStream type (see
22
 * JsonbStream.h). So unless you want to provide your own output stream
23
 * type, use JsonbParser object.
24
 *
25
 * ** Parsing JSON **
26
 * JsonbParserTSIMD parses JSON string, and directly serializes into JSONB
27
 * packed bytes. There are three ways to parse a JSON string: (1) using
28
 * c-string, (2) using string with len, (3) using std::istream object. You can
29
 * use custom streambuf to redirect output. JsonbOutBuffer is a streambuf used
30
 * internally if the input is raw character buffer.
31
 *
32
 * You can reuse a JsonbParserTSIMD object to parse/serialize multiple JSON
33
 * strings, and the previous JSONB will be overwritten.
34
 *
35
 * If parsing fails (returned false), the error code will be set to one of
36
 * JsonbErrType, and can be retrieved by calling getErrorCode().
37
 *
38
 * ** External dictionary **
39
 * During parsing a JSON string, you can pass a call-back function to map a key
40
 * string to an id, and store the dictionary id in JSONB to save space. The
41
 * purpose of using an external dictionary is more towards a collection of
42
 * documents (which has common keys) rather than a single document, so that
43
 * space saving will be significant.
44
 *
45
 * ** Endianness **
46
 * Note: JSONB serialization doesn't assume endianness of the server. However
47
 * you will need to ensure that the endianness at the reader side is the same
48
 * as that at the writer side (if they are on different machines). Otherwise,
49
 * proper conversion is needed when a number value is returned to the
50
 * caller/writer.
51
 *
52
 * @author Tian Xia <tianx@fb.com>
53
 * 
54
 * this file is copied from 
55
 * https://github.com/facebook/mysql-5.6/blob/fb-mysql-5.6.35/fbson/FbsonJsonParser.h
56
 * and modified by Doris
57
 */
58
59
#pragma once
60
#include <simdjson.h>
61
62
#include <cmath>
63
#include <limits>
64
65
#include "common/status.h"
66
#include "util/jsonb_document.h"
67
#include "util/jsonb_writer.h"
68
#include "util/string_parser.hpp"
69
70
namespace doris {
71
using int128_t = __int128;
72
struct JsonbParser {
73
    // According to https://github.com/simdjson/simdjson/pull/2139
74
    // For numbers larger than 64 bits, we can obtain the raw_json_token and parse it ourselves.
75
    // This allows handling numbers larger than 64 bits, such as int128.
76
    // For example, parsing 18446744073709551616, which is just 1 greater than the maximum value of uint64_t, makes simdjson return a NUMBER_ERROR.
77
    // Parsing 18446744073709551616231231, which is obviously a large integer, makes simdjson return a BIGINT_ERROR.
78
341k
    static bool parse_number_success(simdjson::error_code error_code) {
79
341k
        return error_code == simdjson::error_code::SUCCESS ||
80
341k
               error_code == simdjson::error_code::NUMBER_ERROR ||
81
341k
               error_code == simdjson::error_code::BIGINT_ERROR;
82
341k
    }
83
84
    // parse a UTF-8 JSON string with length
85
    // will reset writer before parse
86
56.8k
    static Status parse(const char* pch, size_t len, JsonbWriter& writer) {
87
56.8k
        if (!pch || len == 0) {
88
21
            return Status::InvalidArgument("Empty JSON document");
89
21
        }
90
56.8k
        writer.reset();
91
56.8k
        try {
92
56.8k
            simdjson::ondemand::parser simdjson_parser;
93
56.8k
            simdjson::padded_string json_str {pch, len};
94
56.8k
            simdjson::ondemand::document doc = simdjson_parser.iterate(json_str);
95
96
            // simdjson processes top-level primitive types specially,
97
            // so some code is repeated here
98
56.8k
            switch (doc.type()) {
99
43.4k
            case simdjson::ondemand::json_type::object:
100
54.6k
            case simdjson::ondemand::json_type::array: {
101
54.6k
                RETURN_IF_ERROR(parse(doc.get_value(), writer));
102
54.3k
                break;
103
54.6k
            }
104
54.3k
            case simdjson::ondemand::json_type::null: {
105
122
                if (writer.writeNull() == 0) {
106
0
                    return Status::InvalidArgument("writeNull failed");
107
0
                }
108
122
                break;
109
122
            }
110
135
            case simdjson::ondemand::json_type::boolean: {
111
135
                if (writer.writeBool(doc.get_bool()) == 0) {
112
0
                    return Status::InvalidArgument("writeBool failed");
113
0
                }
114
135
                break;
115
135
            }
116
336
            case simdjson::ondemand::json_type::string: {
117
336
                RETURN_IF_ERROR(write_string(doc.get_string(), writer));
118
336
                break;
119
336
            }
120
545
            case simdjson::ondemand::json_type::number: {
121
545
                simdjson::ondemand::number num;
122
545
                simdjson::error_code res = doc.get_number().get(num);
123
545
                if (!parse_number_success(res)) {
124
0
                    return Status::InvalidArgument(fmt::format("simdjson get_number failed: {}",
125
0
                                                               simdjson::error_message(res)));
126
0
                }
127
                // simdjson get_number() returns a number object, which is passed to write_number() together with its number type and raw JSON token
128
545
                RETURN_IF_ERROR(
129
545
                        write_number(num, doc.get_number_type(), doc.raw_json_token(), writer));
130
487
                break;
131
545
            }
132
56.8k
            }
133
56.8k
        } catch (simdjson::simdjson_error& e) {
134
1.14k
            return Status::InvalidArgument(fmt::format("simdjson parse exception: {}", e.what()));
135
1.14k
        }
136
55.3k
        return Status::OK();
137
56.8k
    }
138
139
private:
140
    // parse json, recursively if necessary, by simdjson
141
    //  and serialize to binary format by writer
142
886k
    static Status parse(simdjson::ondemand::value value, JsonbWriter& writer) {
143
886k
        switch (value.type()) {
144
12.0k
        case simdjson::ondemand::json_type::null: {
145
12.0k
            if (writer.writeNull() == 0) {
146
0
                return Status::InvalidArgument("writeNull failed");
147
0
            }
148
12.0k
            break;
149
12.0k
        }
150
21.2k
        case simdjson::ondemand::json_type::boolean: {
151
21.2k
            if (writer.writeBool(value.get_bool()) == 0) {
152
0
                return Status::InvalidArgument("writeBool failed");
153
0
            }
154
21.2k
            break;
155
21.2k
        }
156
376k
        case simdjson::ondemand::json_type::string: {
157
376k
            RETURN_IF_ERROR(write_string(value.get_string(), writer));
158
376k
            break;
159
376k
        }
160
376k
        case simdjson::ondemand::json_type::number: {
161
340k
            simdjson::ondemand::number num;
162
340k
            auto res = value.get_number().get(num);
163
340k
            if (!parse_number_success(res)) {
164
0
                return Status::InvalidArgument(fmt::format("simdjson get_number failed: {}",
165
0
                                                           simdjson::error_message(res)));
166
0
            }
167
168
340k
            RETURN_IF_ERROR(
169
340k
                    write_number(num, value.get_number_type(), value.raw_json_token(), writer));
170
340k
            break;
171
340k
        }
172
340k
        case simdjson::ondemand::json_type::object: {
173
61.8k
            if (!writer.writeStartObject()) {
174
0
                return Status::InvalidArgument("writeStartObject failed");
175
0
            }
176
177
196k
            for (auto kv : value.get_object()) {
178
196k
                std::string_view key;
179
196k
                simdjson::error_code e = kv.unescaped_key().get(key);
180
196k
                if (e != simdjson::SUCCESS) {
181
41
                    return Status::InvalidArgument(fmt::format("simdjson get key failed: {}", e));
182
41
                }
183
184
                // write key
185
196k
                if (key.size() > std::numeric_limits<uint8_t>::max()) {
186
268
                    return Status::InvalidArgument("key size exceeds max limit: {} , {}",
187
268
                                                   key.size(), std::numeric_limits<uint8_t>::max());
188
268
                }
189
196k
                if (!writer.writeKey(key.data(), (uint8_t)key.size())) {
190
0
                    return Status::InvalidArgument("writeKey failed : {}", key);
191
0
                }
192
193
                // parse object value
194
196k
                RETURN_IF_ERROR(parse(kv.value(), writer));
195
196k
            }
196
197
61.4k
            if (!writer.writeEndObject()) {
198
0
                return Status::InvalidArgument("writeEndObject failed");
199
0
                break;
200
0
            }
201
202
61.4k
            break;
203
61.4k
        }
204
73.9k
        case simdjson::ondemand::json_type::array: {
205
73.9k
            if (!writer.writeStartArray()) {
206
0
                return Status::InvalidArgument("writeStartArray failed");
207
0
            }
208
209
636k
            for (auto elem : value.get_array()) {
210
                // parse array element
211
636k
                RETURN_IF_ERROR(parse(elem.value(), writer));
212
636k
            }
213
214
73.7k
            if (!writer.writeEndArray()) {
215
0
                return Status::InvalidArgument("writeEndArray failed");
216
0
            }
217
73.7k
            break;
218
73.7k
        }
219
73.7k
        default: {
220
0
            return Status::InvalidArgument("unknown value type: ");
221
73.7k
        }
222
223
886k
        } // end of switch
224
885k
        return Status::OK();
225
886k
    }
226
227
376k
    static Status write_string(std::string_view str, JsonbWriter& writer) {
228
        // start writing string
229
376k
        if (!writer.writeStartString()) {
230
0
            return Status::InvalidArgument("writeStartString failed");
231
0
        }
232
233
        // write string
234
376k
        if (str.size() > 0) {
235
365k
            if (writer.writeString(str.data(), str.size()) == 0) {
236
0
                return Status::InvalidArgument("writeString failed");
237
0
            }
238
365k
        }
239
240
        // end writing string
241
376k
        if (!writer.writeEndString()) {
242
0
            return Status::InvalidArgument("writeEndString failed");
243
0
        }
244
376k
        return Status::OK();
245
376k
    }
246
247
    static Status write_number(simdjson::ondemand::number num,
248
                               simdjson ::ondemand::number_type num_type,
249
341k
                               std::string_view raw_string, JsonbWriter& writer) {
250
        // The simdjson library supports four types of numbers:
251
        // 1. floating_point_number: A binary64 number, which will be converted to jsonb's double type.
252
        // 2. signed_integer: A signed integer that fits in a 64-bit word using two's complement.
253
        // 3. unsigned_integer: A positive integer larger or equal to 1<<63.
254
        //    For these two integer types, we will convert them to jsonb's int8/int16/int32/int64/int128 types according to the specific value.
255
        // 4. big_integer: An integer that does not fit in a 64-bit word.
256
        //    For this type, simdjson cannot handle it directly. We first try to convert it to jsonb's int128 type.
257
        //    If conversion fails, we attempt to convert it to a double type.
258
        //    If conversion to double also fails, an error is returned.
259
260
341k
        switch (num_type) {
261
138k
        case simdjson::ondemand::number_type::floating_point_number: {
262
138k
            double number = num.get_double();
263
            // When a double exceeds the precision that can be represented by a double type in simdjson, it gets converted to 0.
264
            // The correct approach should be to truncate the double value instead.
265
138k
            if (number == 0) {
266
13.7k
                StringParser::ParseResult result;
267
13.7k
                number = StringParser::string_to_float<double>(raw_string.data(), raw_string.size(),
268
13.7k
                                                               &result);
269
13.7k
                if (result != StringParser::PARSE_SUCCESS) {
270
59
                    return Status::InvalidArgument("invalid number, raw string is: " +
271
59
                                                   std::string(raw_string));
272
59
                }
273
13.7k
            }
274
275
138k
            if (writer.writeDouble(number) == 0) {
276
0
                return Status::InvalidArgument("writeDouble failed");
277
0
            }
278
279
138k
            break;
280
138k
        }
281
202k
        case simdjson::ondemand::number_type::signed_integer:
282
202k
        case simdjson::ondemand::number_type::unsigned_integer: {
283
202k
            int128_t val = num.is_int64() ? (int128_t)num.get_int64() : (int128_t)num.get_uint64();
284
202k
            bool success = false;
285
202k
            if (val >= std::numeric_limits<int8_t>::min() &&
286
202k
                val <= std::numeric_limits<int8_t>::max()) {
287
75.2k
                success = writer.writeInt8((int8_t)val);
288
127k
            } else if (val >= std::numeric_limits<int16_t>::min() &&
289
127k
                       val <= std::numeric_limits<int16_t>::max()) {
290
50.8k
                success = writer.writeInt16((int16_t)val);
291
76.4k
            } else if (val >= std::numeric_limits<int32_t>::min() &&
292
76.4k
                       val <= std::numeric_limits<int32_t>::max()) {
293
8.09k
                success = writer.writeInt32((int32_t)val);
294
68.3k
            } else if (val >= std::numeric_limits<int64_t>::min() &&
295
68.5k
                       val <= std::numeric_limits<int64_t>::max()) {
296
68.4k
                success = writer.writeInt64((int64_t)val);
297
18.4E
            } else { // INT128
298
18.4E
                success = writer.writeInt128(val);
299
18.4E
            }
300
301
202k
            if (!success) {
302
0
                return Status::InvalidArgument("writeInt failed");
303
0
            }
304
202k
            break;
305
202k
        }
306
202k
        case simdjson::ondemand::number_type::big_integer: {
307
12
            StringParser::ParseResult result;
308
12
            auto val = StringParser::string_to_int<int128_t>(raw_string.data(), raw_string.size(),
309
12
                                                             &result);
310
12
            if (result != StringParser::PARSE_SUCCESS) {
311
                // If the string exceeds the range of int128_t, we attempt to convert it to double.
312
                // This may result in loss of precision, but for JSON, exchanging data as plain text between different systems may inherently cause precision loss.
313
                // try parse as double
314
2
                double double_val = StringParser::string_to_float<double>(
315
2
                        raw_string.data(), raw_string.size(), &result);
316
2
                if (result != StringParser::PARSE_SUCCESS) {
317
                    // if both parse failed, return error
318
0
                    return Status::InvalidArgument("invalid number, raw string is: " +
319
0
                                                   std::string(raw_string));
320
0
                }
321
2
                if (!writer.writeDouble(double_val)) {
322
0
                    return Status::InvalidArgument("writeDouble failed");
323
0
                }
324
10
            } else {
325
                // as int128_t
326
10
                if (!writer.writeInt128(val)) {
327
0
                    return Status::InvalidArgument("writeInt128 failed");
328
0
                }
329
10
            }
330
12
            break;
331
12
        }
332
341k
        }
333
341k
        return Status::OK();
334
341k
    }
335
};
336
} // namespace doris