Coverage Report

Created: 2026-03-14 04:23

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/util/jsonb_parser_simd.h
Line
Count
Source
1
/*
2
 *  Copyright (c) 2014, Facebook, Inc.
3
 *  All rights reserved.
4
 *
5
 *  This source code is licensed under the BSD-style license found in the
6
 *  LICENSE file in the root directory of this source tree. An additional grant
7
 *  of patent rights can be found in the PATENTS file in the same directory.
8
 *
9
 */
10
11
/*
12
 * This file defines JsonbParserTSIMD (template) and JsonbParser.
13
 *
14
 * JsonbParserTSIMD is a template class which implements a JSON parser.
15
 * JsonbParserTSIMD parses JSON text and serializes it to JSONB binary format
16
 * by using JsonbWriterT object. By default, JsonbParserTSIMD creates a new
17
 * JsonbWriterT object with an output stream object.  However, you can also
18
 * pass in your JsonbWriterT or any stream object that implements some basic
19
 * interface of std::ostream (see JsonbStream.h).
20
 *
21
 * JsonbParser specializes JsonbParserTSIMD with JsonbOutStream type (see
22
 * JsonbStream.h). So unless you want to provide your own, different output stream
23
 * type, use JsonbParser object.
24
 *
25
 * ** Parsing JSON **
26
 * JsonbParserTSIMD parses JSON string, and directly serializes into JSONB
27
 * packed bytes. There are three ways to parse a JSON string: (1) using
28
 * c-string, (2) using string with len, (3) using std::istream object. You can
29
 * use custom streambuf to redirect output. JsonbOutBuffer is a streambuf used
30
 * internally if the input is raw character buffer.
31
 *
32
 * You can reuse a JsonbParserTSIMD object to parse/serialize multiple JSON
33
 * strings, and the previous JSONB will be overwritten.
34
 *
35
 * If parsing fails (returned false), the error code will be set to one of
36
 * JsonbErrType, and can be retrieved by calling getErrorCode().
37
 *
38
 * ** External dictionary **
39
 * While parsing a JSON string, you can pass a call-back function to map a key
40
 * string to an id, and store the dictionary id in JSONB to save space. The
41
 * purpose of using an external dictionary is more towards a collection of
42
 * documents (which has common keys) rather than a single document, so that
43
 * space saving will be significant.
44
 *
45
 * ** Endianness **
46
 * Note: JSONB serialization doesn't assume endianness of the server. However
47
 * you will need to ensure that the endianness at the reader side is the same
48
 * as that at the writer side (if they are on different machines). Otherwise,
49
 * proper conversion is needed when a number value is returned to the
50
 * caller/writer.
51
 *
52
 * @author Tian Xia <tianx@fb.com>
53
 * 
54
 * this file is copied from 
55
 * https://github.com/facebook/mysql-5.6/blob/fb-mysql-5.6.35/fbson/FbsonJsonParser.h
56
 * and modified by Doris
57
 */
58
59
#pragma once
60
#include <simdjson.h>
61
62
#include <cmath>
63
#include <limits>
64
65
#include "common/status.h"
66
#include "util/jsonb_document.h"
67
#include "util/jsonb_writer.h"
68
#include "util/string_parser.hpp"
69
70
namespace doris {
71
#include "common/compile_check_begin.h"
72
using int128_t = __int128;
73
struct JsonbParser {
74
    // According to https://github.com/simdjson/simdjson/pull/2139
75
    // For numbers larger than 64 bits, we can obtain the raw_json_token and parse it ourselves.
76
    // This allows handling numbers larger than 64 bits, such as int128.
77
    // For example, parsing 18446744073709551616, which is just 1 greater than the maximum value of uint64_t, makes simdjson return a NUMBER_ERROR.
78
    // Parsing 18446744073709551616231231, which is obviously a large integer, makes simdjson return a BIGINT_ERROR instead.
79
224k
    static bool parse_number_success(simdjson::error_code error_code) {
80
224k
        return error_code == simdjson::error_code::SUCCESS ||
81
224k
               error_code == simdjson::error_code::NUMBER_ERROR ||
82
224k
               error_code == simdjson::error_code::BIGINT_ERROR;
83
224k
    }
84
85
    // parse a UTF-8 JSON string with length
86
    // will reset writer before parse
87
2.59k
    static Status parse(const char* pch, size_t len, JsonbWriter& writer) {
88
2.59k
        if (!pch || len == 0) {
89
11
            return Status::InvalidArgument("Empty JSON document");
90
11
        }
91
2.58k
        writer.reset();
92
2.58k
        try {
93
2.58k
            simdjson::ondemand::parser simdjson_parser;
94
2.58k
            simdjson::padded_string json_str {pch, len};
95
2.58k
            simdjson::ondemand::document doc = simdjson_parser.iterate(json_str);
96
97
            // simdjson process top level primitive types specially
98
            // so some repeated code here
99
2.58k
            switch (doc.type()) {
100
354
            case simdjson::ondemand::json_type::object:
101
1.89k
            case simdjson::ondemand::json_type::array: {
102
1.89k
                RETURN_IF_ERROR(parse(doc.get_value(), writer));
103
1.62k
                break;
104
1.89k
            }
105
1.62k
            case simdjson::ondemand::json_type::null: {
106
52
                if (writer.writeNull() == 0) {
107
0
                    return Status::InvalidArgument("writeNull failed");
108
0
                }
109
52
                break;
110
52
            }
111
70
            case simdjson::ondemand::json_type::boolean: {
112
70
                if (writer.writeBool(doc.get_bool()) == 0) {
113
0
                    return Status::InvalidArgument("writeBool failed");
114
0
                }
115
70
                break;
116
70
            }
117
299
            case simdjson::ondemand::json_type::string: {
118
299
                RETURN_IF_ERROR(write_string(doc.get_string(), writer));
119
299
                break;
120
299
            }
121
299
            case simdjson::ondemand::json_type::number: {
122
254
                simdjson::ondemand::number num;
123
254
                simdjson::error_code res = doc.get_number().get(num);
124
254
                if (!parse_number_success(res)) {
125
0
                    return Status::InvalidArgument(fmt::format("simdjson get_number failed: {}",
126
0
                                                               simdjson::error_message(res)));
127
0
                }
128
                // simdjson get_number() returns a number object, which can be a double, a signed/unsigned 64-bit integer, or a big integer; write_number() handles each case.
129
254
                RETURN_IF_ERROR(
130
254
                        write_number(num, doc.get_number_type(), doc.raw_json_token(), writer));
131
248
                break;
132
254
            }
133
2.58k
            }
134
2.58k
        } catch (simdjson::simdjson_error& e) {
135
27
            return Status::InvalidArgument(fmt::format("simdjson parse exception: {}", e.what()));
136
27
        }
137
2.28k
        return Status::OK();
138
2.58k
    }
139
140
private:
141
    // parse json, recursively if necessary, by simdjson
142
    //  and serialize to binary format by writer
143
561k
    static Status parse(simdjson::ondemand::value value, JsonbWriter& writer) {
144
561k
        switch (value.type()) {
145
11.7k
        case simdjson::ondemand::json_type::null: {
146
11.7k
            if (writer.writeNull() == 0) {
147
0
                return Status::InvalidArgument("writeNull failed");
148
0
            }
149
11.7k
            break;
150
11.7k
        }
151
20.5k
        case simdjson::ondemand::json_type::boolean: {
152
20.5k
            if (writer.writeBool(value.get_bool()) == 0) {
153
0
                return Status::InvalidArgument("writeBool failed");
154
0
            }
155
20.5k
            break;
156
20.5k
        }
157
250k
        case simdjson::ondemand::json_type::string: {
158
250k
            RETURN_IF_ERROR(write_string(value.get_string(), writer));
159
250k
            break;
160
250k
        }
161
250k
        case simdjson::ondemand::json_type::number: {
162
224k
            simdjson::ondemand::number num;
163
224k
            auto res = value.get_number().get(num);
164
224k
            if (!parse_number_success(res)) {
165
0
                return Status::InvalidArgument(fmt::format("simdjson get_number failed: {}",
166
0
                                                           simdjson::error_message(res)));
167
0
            }
168
169
224k
            RETURN_IF_ERROR(
170
224k
                    write_number(num, value.get_number_type(), value.raw_json_token(), writer));
171
224k
            break;
172
224k
        }
173
224k
        case simdjson::ondemand::json_type::object: {
174
2.58k
            if (!writer.writeStartObject()) {
175
0
                return Status::InvalidArgument("writeStartObject failed");
176
0
            }
177
178
6.07k
            for (auto kv : value.get_object()) {
179
6.07k
                std::string_view key;
180
6.07k
                simdjson::error_code e = kv.unescaped_key().get(key);
181
6.07k
                if (e != simdjson::SUCCESS) {
182
2
                    return Status::InvalidArgument(fmt::format("simdjson get key failed: {}", e));
183
2
                }
184
185
                // write key
186
6.07k
                if (key.size() > std::numeric_limits<uint8_t>::max()) {
187
268
                    return Status::InvalidArgument("key size exceeds max limit: {} , {}",
188
268
                                                   key.size(), std::numeric_limits<uint8_t>::max());
189
268
                }
190
5.80k
                if (!writer.writeKey(key.data(), (uint8_t)key.size())) {
191
0
                    return Status::InvalidArgument("writeKey failed : {}", key);
192
0
                }
193
194
                // parse object value
195
5.80k
                RETURN_IF_ERROR(parse(kv.value(), writer));
196
5.80k
            }
197
198
2.22k
            if (!writer.writeEndObject()) {
199
0
                return Status::InvalidArgument("writeEndObject failed");
200
0
                break;
201
0
            }
202
203
2.22k
            break;
204
2.22k
        }
205
51.7k
        case simdjson::ondemand::json_type::array: {
206
51.7k
            if (!writer.writeStartArray()) {
207
0
                return Status::InvalidArgument("writeStartArray failed");
208
0
            }
209
210
553k
            for (auto elem : value.get_array()) {
211
                // parse array element
212
553k
                RETURN_IF_ERROR(parse(elem.value(), writer));
213
553k
            }
214
215
51.5k
            if (!writer.writeEndArray()) {
216
0
                return Status::InvalidArgument("writeEndArray failed");
217
0
            }
218
51.5k
            break;
219
51.5k
        }
220
51.5k
        default: {
221
0
            return Status::InvalidArgument("unknown value type: ");
222
51.5k
        }
223
224
561k
        } // end of switch
225
560k
        return Status::OK();
226
561k
    }
227
228
250k
    static Status write_string(std::string_view str, JsonbWriter& writer) {
229
        // start writing string
230
250k
        if (!writer.writeStartString()) {
231
0
            return Status::InvalidArgument("writeStartString failed");
232
0
        }
233
234
        // write string
235
250k
        if (str.size() > 0) {
236
239k
            if (writer.writeString(str.data(), str.size()) == 0) {
237
0
                return Status::InvalidArgument("writeString failed");
238
0
            }
239
239k
        }
240
241
        // end writing string
242
250k
        if (!writer.writeEndString()) {
243
0
            return Status::InvalidArgument("writeEndString failed");
244
0
        }
245
250k
        return Status::OK();
246
250k
    }
247
248
    static Status write_number(simdjson::ondemand::number num,
249
                               simdjson ::ondemand::number_type num_type,
250
224k
                               std::string_view raw_string, JsonbWriter& writer) {
251
        // The simdjson library supports four types of numbers:
252
        // 1. floating_point_number: A binary64 number, which will be converted to jsonb's double type.
253
        // 2. signed_integer: A signed integer that fits in a 64-bit word using two's complement.
254
        // 3. unsigned_integer: A positive integer larger or equal to 1<<63.
255
        //    For these two integer types, we will convert them to jsonb's int8/int16/int32/int64/int128 types according to the specific value.
256
        // 4. big_integer: An integer that does not fit in a 64-bit word.
257
        //    For this type, simdjson cannot handle it directly. We first try to convert it to jsonb's int128 type.
258
        //    If conversion fails, we attempt to convert it to a double type.
259
        //    If conversion to double also fails, an error is returned.
260
261
224k
        switch (num_type) {
262
115k
        case simdjson::ondemand::number_type::floating_point_number: {
263
115k
            double number = num.get_double();
264
            // When a double exceeds the precision that can be represented by a double type in simdjson, it gets converted to 0.
265
            // The correct approach should be to truncate the double value instead.
266
115k
            if (number == 0) {
267
13.7k
                StringParser::ParseResult result;
268
13.7k
                number = StringParser::string_to_float<double>(raw_string.data(), raw_string.size(),
269
13.7k
                                                               &result);
270
13.7k
                if (result != StringParser::PARSE_SUCCESS) {
271
7
                    return Status::InvalidArgument("invalid number, raw string is: " +
272
7
                                                   std::string(raw_string));
273
7
                }
274
13.7k
            }
275
276
115k
            if (writer.writeDouble(number) == 0) {
277
0
                return Status::InvalidArgument("writeDouble failed");
278
0
            }
279
280
115k
            break;
281
115k
        }
282
115k
        case simdjson::ondemand::number_type::signed_integer:
283
109k
        case simdjson::ondemand::number_type::unsigned_integer: {
284
109k
            int128_t val = num.is_int64() ? (int128_t)num.get_int64() : (int128_t)num.get_uint64();
285
109k
            bool success = false;
286
109k
            if (val >= std::numeric_limits<int8_t>::min() &&
287
109k
                val <= std::numeric_limits<int8_t>::max()) {
288
47.9k
                success = writer.writeInt8((int8_t)val);
289
61.1k
            } else if (val >= std::numeric_limits<int16_t>::min() &&
290
61.1k
                       val <= std::numeric_limits<int16_t>::max()) {
291
97
                success = writer.writeInt16((int16_t)val);
292
61.0k
            } else if (val >= std::numeric_limits<int32_t>::min() &&
293
61.0k
                       val <= std::numeric_limits<int32_t>::max()) {
294
30
                success = writer.writeInt32((int32_t)val);
295
60.9k
            } else if (val >= std::numeric_limits<int64_t>::min() &&
296
60.9k
                       val <= std::numeric_limits<int64_t>::max()) {
297
60.9k
                success = writer.writeInt64((int64_t)val);
298
60.9k
            } else { // INT128
299
12
                success = writer.writeInt128(val);
300
12
            }
301
302
109k
            if (!success) {
303
0
                return Status::InvalidArgument("writeInt failed");
304
0
            }
305
109k
            break;
306
109k
        }
307
109k
        case simdjson::ondemand::number_type::big_integer: {
308
3
            StringParser::ParseResult result;
309
3
            auto val = StringParser::string_to_int<int128_t>(raw_string.data(), raw_string.size(),
310
3
                                                             &result);
311
3
            if (result != StringParser::PARSE_SUCCESS) {
312
                // If the string exceeds the range of int128_t, it will attempt to convert it to double.
313
                // This may result in loss of precision, but for JSON, exchanging data as plain text between different systems may inherently cause precision loss.
314
                // try parse as double
315
1
                double double_val = StringParser::string_to_float<double>(
316
1
                        raw_string.data(), raw_string.size(), &result);
317
1
                if (result != StringParser::PARSE_SUCCESS) {
318
                    // if both parse failed, return error
319
0
                    return Status::InvalidArgument("invalid number, raw string is: " +
320
0
                                                   std::string(raw_string));
321
0
                }
322
1
                if (!writer.writeDouble(double_val)) {
323
0
                    return Status::InvalidArgument("writeDouble failed");
324
0
                }
325
2
            } else {
326
                // as int128_t
327
2
                if (!writer.writeInt128(val)) {
328
0
                    return Status::InvalidArgument("writeInt128 failed");
329
0
                }
330
2
            }
331
3
            break;
332
3
        }
333
224k
        }
334
224k
        return Status::OK();
335
224k
    }
336
};
337
#include "common/compile_check_end.h"
338
} // namespace doris