Coverage Report

Created: 2026-03-13 09:37

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/util/jsonb_parser_simd.h
Line
Count
Source
1
/*
2
 *  Copyright (c) 2014, Facebook, Inc.
3
 *  All rights reserved.
4
 *
5
 *  This source code is licensed under the BSD-style license found in the
6
 *  LICENSE file in the root directory of this source tree. An additional grant
7
 *  of patent rights can be found in the PATENTS file in the same directory.
8
 *
9
 */
10
11
/*
12
 * This file defines JsonbParserTSIMD (template) and JsonbParser.
13
 *
14
 * JsonbParserTSIMD is a template class which implements a JSON parser.
15
 * JsonbParserTSIMD parses JSON text and serializes it to JSONB binary format
16
 * by using JsonbWriterT object. By default, JsonbParserTSIMD creates a new
17
 * JsonbWriterT object with an output stream object.  However, you can also
18
 * pass in your JsonbWriterT or any stream object that implements some basic
19
 * interface of std::ostream (see JsonbStream.h).
20
 *
21
 * JsonbParser specializes JsonbParserTSIMD with JsonbOutStream type (see
22
 * JsonbStream.h). So unless you want to provide your own, different output stream
23
 * type, use JsonbParser object.
24
 *
25
 * ** Parsing JSON **
26
 * JsonbParserTSIMD parses JSON string, and directly serializes into JSONB
27
 * packed bytes. There are three ways to parse a JSON string: (1) using
28
 * c-string, (2) using string with len, (3) using std::istream object. You can
29
 * use custom streambuf to redirect output. JsonbOutBuffer is a streambuf used
30
 * internally if the input is raw character buffer.
31
 *
32
 * You can reuse a JsonbParserTSIMD object to parse/serialize multiple JSON
33
 * strings, and the previous JSONB will be overwritten.
34
 *
35
 * If parsing fails (returned false), the error code will be set to one of
36
 * JsonbErrType, and can be retrieved by calling getErrorCode().
37
 *
38
 * ** External dictionary **
39
 * While parsing a JSON string, you can pass a callback function to map a key
40
 * string to an id, and store the dictionary id in JSONB to save space. The
41
 * purpose of using an external dictionary is geared more towards a collection of
42
 * documents (which have common keys) rather than a single document, so that
43
 * space saving will be significant.
44
 *
45
 * ** Endianness **
46
 * Note: JSONB serialization doesn't assume endianness of the server. However
47
 * you will need to ensure that the endianness at the reader side is the same
48
 * as that at the writer side (if they are on different machines). Otherwise,
49
 * proper conversion is needed when a number value is returned to the
50
 * caller/writer.
51
 *
52
 * @author Tian Xia <tianx@fb.com>
53
 * 
54
 * This file is copied from
55
 * https://github.com/facebook/mysql-5.6/blob/fb-mysql-5.6.35/fbson/FbsonJsonParser.h
56
 * and modified by Doris
57
 */
58
59
#pragma once
60
#include <simdjson.h>
61
62
#include <cmath>
63
#include <limits>
64
65
#include "common/status.h"
66
#include "util/jsonb_document.h"
67
#include "util/jsonb_writer.h"
68
#include "util/string_parser.hpp"
69
70
namespace doris {
71
#include "common/compile_check_begin.h"
72
using int128_t = __int128;
73
struct JsonbParser {
74
    // According to https://github.com/simdjson/simdjson/pull/2139
75
    // For numbers larger than 64 bits, we can obtain the raw_json_token and parse it ourselves.
76
    // This allows handling numbers larger than 64 bits, such as int128.
77
    // For example, when parsing 18446744073709551616 (just 1 greater than the maximum value of uint64_t), simdjson returns NUMBER_ERROR.
78
    // When parsing 18446744073709551616231231, which is obviously a large integer, simdjson returns BIGINT_ERROR instead.
79
344k
    static bool parse_number_success(simdjson::error_code error_code) {
80
344k
        return error_code == simdjson::error_code::SUCCESS ||
81
344k
               error_code == simdjson::error_code::NUMBER_ERROR ||
82
344k
               error_code == simdjson::error_code::BIGINT_ERROR;
83
344k
    }
84
85
    // parse a UTF-8 JSON string with length
86
    // the writer is reset before parsing
87
59.1k
    static Status parse(const char* pch, size_t len, JsonbWriter& writer) {
88
59.1k
        if (!pch || len == 0) {
89
21
            return Status::InvalidArgument("Empty JSON document");
90
21
        }
91
59.1k
        writer.reset();
92
59.1k
        try {
93
59.1k
            simdjson::ondemand::parser simdjson_parser;
94
59.1k
            simdjson::padded_string json_str {pch, len};
95
59.1k
            simdjson::ondemand::document doc = simdjson_parser.iterate(json_str);
96
97
            // simdjson processes top-level primitive types specially,
98
            // so some code is repeated here
99
59.1k
            switch (doc.type()) {
100
45.5k
            case simdjson::ondemand::json_type::object:
101
56.9k
            case simdjson::ondemand::json_type::array: {
102
56.9k
                RETURN_IF_ERROR(parse(doc.get_value(), writer));
103
56.6k
                break;
104
56.9k
            }
105
56.6k
            case simdjson::ondemand::json_type::null: {
106
122
                if (writer.writeNull() == 0) {
107
0
                    return Status::InvalidArgument("writeNull failed");
108
0
                }
109
122
                break;
110
122
            }
111
135
            case simdjson::ondemand::json_type::boolean: {
112
135
                if (writer.writeBool(doc.get_bool()) == 0) {
113
0
                    return Status::InvalidArgument("writeBool failed");
114
0
                }
115
135
                break;
116
135
            }
117
335
            case simdjson::ondemand::json_type::string: {
118
335
                RETURN_IF_ERROR(write_string(doc.get_string(), writer));
119
335
                break;
120
335
            }
121
545
            case simdjson::ondemand::json_type::number: {
122
545
                simdjson::ondemand::number num;
123
545
                simdjson::error_code res = doc.get_number().get(num);
124
545
                if (!parse_number_success(res)) {
125
0
                    return Status::InvalidArgument(fmt::format("simdjson get_number failed: {}",
126
0
                                                               simdjson::error_message(res)));
127
0
                }
128
                // simdjson get_number() returns a number object, which can be a double, an int64 or a uint64; write_number() picks the JSONB type to write.
129
545
                RETURN_IF_ERROR(
130
545
                        write_number(num, doc.get_number_type(), doc.raw_json_token(), writer));
131
487
                break;
132
545
            }
133
59.1k
            }
134
59.1k
        } catch (simdjson::simdjson_error& e) {
135
1.14k
            return Status::InvalidArgument(fmt::format("simdjson parse exception: {}", e.what()));
136
1.14k
        }
137
57.6k
        return Status::OK();
138
59.1k
    }
139
140
private:
141
    // parse json, recursively if necessary, by simdjson
142
    //  and serialize to binary format by writer
143
901k
    static Status parse(simdjson::ondemand::value value, JsonbWriter& writer) {
144
901k
        switch (value.type()) {
145
11.9k
        case simdjson::ondemand::json_type::null: {
146
11.9k
            if (writer.writeNull() == 0) {
147
0
                return Status::InvalidArgument("writeNull failed");
148
0
            }
149
11.9k
            break;
150
11.9k
        }
151
21.3k
        case simdjson::ondemand::json_type::boolean: {
152
21.3k
            if (writer.writeBool(value.get_bool()) == 0) {
153
0
                return Status::InvalidArgument("writeBool failed");
154
0
            }
155
21.3k
            break;
156
21.3k
        }
157
385k
        case simdjson::ondemand::json_type::string: {
158
385k
            RETURN_IF_ERROR(write_string(value.get_string(), writer));
159
385k
            break;
160
385k
        }
161
385k
        case simdjson::ondemand::json_type::number: {
162
343k
            simdjson::ondemand::number num;
163
343k
            auto res = value.get_number().get(num);
164
343k
            if (!parse_number_success(res)) {
165
0
                return Status::InvalidArgument(fmt::format("simdjson get_number failed: {}",
166
0
                                                           simdjson::error_message(res)));
167
0
            }
168
169
343k
            RETURN_IF_ERROR(
170
343k
                    write_number(num, value.get_number_type(), value.raw_json_token(), writer));
171
343k
            break;
172
343k
        }
173
343k
        case simdjson::ondemand::json_type::object: {
174
64.9k
            if (!writer.writeStartObject()) {
175
0
                return Status::InvalidArgument("writeStartObject failed");
176
0
            }
177
178
209k
            for (auto kv : value.get_object()) {
179
209k
                std::string_view key;
180
209k
                simdjson::error_code e = kv.unescaped_key().get(key);
181
209k
                if (e != simdjson::SUCCESS) {
182
41
                    return Status::InvalidArgument(fmt::format("simdjson get key failed: {}", e));
183
41
                }
184
185
                // write key
186
209k
                if (key.size() > std::numeric_limits<uint8_t>::max()) {
187
268
                    return Status::InvalidArgument("key size exceeds max limit: {} , {}",
188
268
                                                   key.size(), std::numeric_limits<uint8_t>::max());
189
268
                }
190
208k
                if (!writer.writeKey(key.data(), (uint8_t)key.size())) {
191
0
                    return Status::InvalidArgument("writeKey failed : {}", key);
192
0
                }
193
194
                // parse object value
195
208k
                RETURN_IF_ERROR(parse(kv.value(), writer));
196
208k
            }
197
198
64.5k
            if (!writer.writeEndObject()) {
199
0
                return Status::InvalidArgument("writeEndObject failed");
200
0
                break;
201
0
            }
202
203
64.5k
            break;
204
64.5k
        }
205
74.2k
        case simdjson::ondemand::json_type::array: {
206
74.2k
            if (!writer.writeStartArray()) {
207
0
                return Status::InvalidArgument("writeStartArray failed");
208
0
            }
209
210
636k
            for (auto elem : value.get_array()) {
211
                // parse array element
212
636k
                RETURN_IF_ERROR(parse(elem.value(), writer));
213
636k
            }
214
215
74.0k
            if (!writer.writeEndArray()) {
216
0
                return Status::InvalidArgument("writeEndArray failed");
217
0
            }
218
74.0k
            break;
219
74.0k
        }
220
74.0k
        default: {
221
0
            return Status::InvalidArgument("unknown value type: ");
222
74.0k
        }
223
224
901k
        } // end of switch
225
901k
        return Status::OK();
226
901k
    }
227
228
386k
    static Status write_string(std::string_view str, JsonbWriter& writer) {
229
        // start writing string
230
386k
        if (!writer.writeStartString()) {
231
0
            return Status::InvalidArgument("writeStartString failed");
232
0
        }
233
234
        // write string
235
386k
        if (str.size() > 0) {
236
374k
            if (writer.writeString(str.data(), str.size()) == 0) {
237
0
                return Status::InvalidArgument("writeString failed");
238
0
            }
239
374k
        }
240
241
        // end writing string
242
386k
        if (!writer.writeEndString()) {
243
0
            return Status::InvalidArgument("writeEndString failed");
244
0
        }
245
386k
        return Status::OK();
246
386k
    }
247
248
    static Status write_number(simdjson::ondemand::number num,
249
                               simdjson ::ondemand::number_type num_type,
250
344k
                               std::string_view raw_string, JsonbWriter& writer) {
251
        // The simdjson library supports four types of numbers:
252
        // 1. floating_point_number: A binary64 number, which will be converted to jsonb's double type.
253
        // 2. signed_integer: A signed integer that fits in a 64-bit word using two's complement.
254
        // 3. unsigned_integer: A positive integer larger than or equal to 1<<63.
255
        //    For these two integer types, we will convert them to jsonb's int8/int16/int32/int64/int128 types according to the specific value.
256
        // 4. big_integer: An integer that does not fit in a 64-bit word.
257
        //    For this type, simdjson cannot handle it directly. We first try to convert it to jsonb's int128 type.
258
        //    If conversion fails, we attempt to convert it to a double type.
259
        //    If conversion to double also fails, an error is returned.
260
261
344k
        switch (num_type) {
262
140k
        case simdjson::ondemand::number_type::floating_point_number: {
263
140k
            double number = num.get_double();
264
            // When a double exceeds the precision that can be represented by a double type in simdjson, it gets converted to 0.
265
            // The correct approach should be to truncate the double value instead.
266
140k
            if (number == 0) {
267
13.7k
                StringParser::ParseResult result;
268
13.7k
                number = StringParser::string_to_float<double>(raw_string.data(), raw_string.size(),
269
13.7k
                                                               &result);
270
13.7k
                if (result != StringParser::PARSE_SUCCESS) {
271
59
                    return Status::InvalidArgument("invalid number, raw string is: " +
272
59
                                                   std::string(raw_string));
273
59
                }
274
13.7k
            }
275
276
140k
            if (writer.writeDouble(number) == 0) {
277
0
                return Status::InvalidArgument("writeDouble failed");
278
0
            }
279
280
140k
            break;
281
140k
        }
282
204k
        case simdjson::ondemand::number_type::signed_integer:
283
204k
        case simdjson::ondemand::number_type::unsigned_integer: {
284
204k
            int128_t val = num.is_int64() ? (int128_t)num.get_int64() : (int128_t)num.get_uint64();
285
204k
            bool success = false;
286
204k
            if (val >= std::numeric_limits<int8_t>::min() &&
287
204k
                val <= std::numeric_limits<int8_t>::max()) {
288
76.4k
                success = writer.writeInt8((int8_t)val);
289
127k
            } else if (val >= std::numeric_limits<int16_t>::min() &&
290
127k
                       val <= std::numeric_limits<int16_t>::max()) {
291
50.8k
                success = writer.writeInt16((int16_t)val);
292
76.8k
            } else if (val >= std::numeric_limits<int32_t>::min() &&
293
76.8k
                       val <= std::numeric_limits<int32_t>::max()) {
294
8.44k
                success = writer.writeInt32((int32_t)val);
295
68.3k
            } else if (val >= std::numeric_limits<int64_t>::min() &&
296
68.4k
                       val <= std::numeric_limits<int64_t>::max()) {
297
68.4k
                success = writer.writeInt64((int64_t)val);
298
18.4E
            } else { // INT128
299
18.4E
                success = writer.writeInt128(val);
300
18.4E
            }
301
302
204k
            if (!success) {
303
0
                return Status::InvalidArgument("writeInt failed");
304
0
            }
305
204k
            break;
306
204k
        }
307
204k
        case simdjson::ondemand::number_type::big_integer: {
308
12
            StringParser::ParseResult result;
309
12
            auto val = StringParser::string_to_int<int128_t>(raw_string.data(), raw_string.size(),
310
12
                                                             &result);
311
12
            if (result != StringParser::PARSE_SUCCESS) {
312
                // If the string exceeds the range of int128_t, it will attempt to convert it to double.
313
                // This may result in loss of precision, but for JSON, exchanging data as plain text between different systems may inherently cause precision loss.
314
                // try parse as double
315
2
                double double_val = StringParser::string_to_float<double>(
316
2
                        raw_string.data(), raw_string.size(), &result);
317
2
                if (result != StringParser::PARSE_SUCCESS) {
318
                    // if both parse failed, return error
319
0
                    return Status::InvalidArgument("invalid number, raw string is: " +
320
0
                                                   std::string(raw_string));
321
0
                }
322
2
                if (!writer.writeDouble(double_val)) {
323
0
                    return Status::InvalidArgument("writeDouble failed");
324
0
                }
325
10
            } else {
326
                // as int128_t
327
10
                if (!writer.writeInt128(val)) {
328
0
                    return Status::InvalidArgument("writeInt128 failed");
329
0
                }
330
10
            }
331
12
            break;
332
12
        }
333
344k
        }
334
344k
        return Status::OK();
335
344k
    }
336
};
337
#include "common/compile_check_end.h"
338
} // namespace doris