Coverage Report

Created: 2026-02-23 23:33

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/root/doris/be/src/exprs/json_functions.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "exprs/json_functions.h"
19
20
#include <rapidjson/allocators.h>
21
#include <rapidjson/document.h>
22
#include <rapidjson/encodings.h>
23
#include <rapidjson/rapidjson.h>
24
#include <rapidjson/stringbuffer.h>
25
#include <rapidjson/writer.h>
26
#include <re2/re2.h>
27
#include <simdjson/error.h>
28
#include <simdjson/simdjson.h> // IWYU pragma: keep
29
#include <stdlib.h>
30
31
#include <boost/iterator/iterator_facade.hpp>
32
#include <boost/token_functions.hpp>
33
#include <boost/tokenizer.hpp>
34
#include <sstream>
35
#include <string>
36
#include <vector>
37
38
#include "common/compiler_util.h" // IWYU pragma: keep
39
#include "common/exception.h"
40
#include "common/logging.h"
41
42
namespace doris {
43
44
// static const re2::RE2 JSON_PATTERN("^([a-zA-Z0-9_\\-\\:\\s#\\|\\.]*)(?:\\[([0-9]+)\\])?");
45
// A json path key cannot contain any of: ", [, ]
46
// Pattern applied to a single path segment (one element of the '.'-split path).
//   group 1: the key - any run (possibly empty) of characters other than '"', '[' and ']'
//   group 2: optional subscript written as "[...]" - either decimal digits or '*'
static const re2::RE2 JSON_PATTERN("^([^\\\"\\[\\]]*)(?:\\[([0-9]+|\\*)\\])?");
47
48
// Resolves `parsed_paths` against `document` and returns the value the path points to.
//
// Element 0 of `parsed_paths` (the root) is skipped; every following element is applied in
// order to the current value ("root"):
//   * a non-empty key applied to an object selects that member;
//   * a non-empty key applied to an array collects that member from every object element
//     into a newly allocated array (array-valued members are flattened one level, null
//     members are skipped, and when `is_insert_null` is true a null is pushed for each
//     object that lacks the key). If nothing was collected the result becomes a null value;
//   * idx == -2 ("[*]") replaces the current array with a deep copy of its elements;
//   * any other idx != -1 selects that element of the current array.
//
// Returns nullptr when the current value is null, a path element is invalid, a key is
// missing from an object, a key is applied to a scalar, a subscript is applied to a
// non-array, or the subscript is out of range. When `parsed_paths` has at most one element
// the loop does not run and `document` itself is returned.
//
// Newly created arrays are allocated from `mem_allocator`.
// NOTE(review): rapidjson's PushBack(Value&, Allocator&) uses move semantics, so members
// collected in the key-on-array case are presumably moved out of `document` - confirm that
// callers do not read those members again afterwards.
// NOTE(review): NO_SANITIZE_UNDEFINED is presumably required because SetArray() is invoked
// on raw memory returned by Malloc() - confirm before removing the annotation.
rapidjson::Value* NO_SANITIZE_UNDEFINED
JsonFunctions::match_value(const std::vector<JsonPath>& parsed_paths, rapidjson::Value* document,
                           rapidjson::Document::AllocatorType& mem_allocator, bool is_insert_null) {
    rapidjson::Value* root = document;

    // Start at 1: element 0 is the root marker and has nothing to match.
    for (size_t i = 1; i < parsed_paths.size(); ++i) {
        const JsonPath& path = parsed_paths[i];
        VLOG_TRACE << "parsed_paths: " << path.debug_string();

        if (root == nullptr || root->IsNull()) {
            return nullptr;
        }

        if (UNLIKELY(!path.is_valid)) {
            return nullptr;
        }

        const std::string& col = path.key;
        const int index = path.idx;

        if (LIKELY(!col.empty())) {
            if (root->IsArray()) {
                auto* collected = static_cast<rapidjson::Value*>(
                        mem_allocator.Malloc(sizeof(rapidjson::Value)));
                collected->SetArray();
                bool is_null = true;

                // Visit every object element of the array and gather its `col` member.
                for (rapidjson::SizeType j = 0; j < root->Size(); ++j) {
                    rapidjson::Value& json_elem = (*root)[j];

                    // Nested arrays, nulls and scalars cannot carry the key: skip them.
                    if (!json_elem.IsObject()) {
                        continue;
                    }

                    // Single lookup instead of HasMember() followed by operator[].
                    auto member = json_elem.FindMember(col.c_str());
                    if (member == json_elem.MemberEnd()) {
                        if (is_insert_null) { // not found item, then insert a null object.
                            is_null = false;
                            rapidjson::Value null_value(rapidjson::kNullType);
                            collected->PushBack(null_value, mem_allocator);
                        }
                        continue;
                    }

                    rapidjson::Value& found = member->value;
                    if (found.IsArray()) {
                        // Flatten one level: append each element of the member array.
                        is_null = false;
                        for (rapidjson::SizeType k = 0; k < found.Size(); ++k) {
                            collected->PushBack(found[k], mem_allocator);
                        }
                    } else if (!found.IsNull()) {
                        is_null = false;
                        collected->PushBack(found, mem_allocator);
                    }
                }

                root = is_null ? &(collected->SetNull()) : collected;
            } else if (root->IsObject()) {
                auto member = root->FindMember(col.c_str());
                if (member == root->MemberEnd()) {
                    return nullptr;
                }
                root = &(member->value);
            } else {
                // root is not a nested type, return nullptr
                return nullptr;
            }
        }

        if (UNLIKELY(index != -1)) {
            // A subscript only makes sense on an array (a null produced above is not one).
            if (!root->IsArray()) {
                return nullptr;
            }

            if (index == -2) {
                // [*]: replace the array by a deep copy of all of its elements.
                auto* expanded = static_cast<rapidjson::Value*>(
                        mem_allocator.Malloc(sizeof(rapidjson::Value)));
                expanded->SetArray();

                for (rapidjson::SizeType j = 0; j < root->Size(); ++j) {
                    rapidjson::Value v;
                    v.CopyFrom((*root)[j], mem_allocator);
                    expanded->PushBack(v, mem_allocator);
                }
                root = expanded;
            } else if (index < 0 || static_cast<rapidjson::SizeType>(index) >= root->Size()) {
                // Negative (other than the -1/-2 sentinels) or past the end: no match.
                return nullptr;
            } else {
                root = &((*root)[static_cast<rapidjson::SizeType>(index)]);
            }
        }
    }
    return root;
}
147
148
// Convenience overload: parses `json_path` with parse_json_paths() and forwards the parsed
// path, together with all other arguments, to the std::vector<JsonPath> overload.
rapidjson::Value* JsonFunctions::get_json_array_from_parsed_json(
        const std::string& json_path, rapidjson::Value* document,
        rapidjson::Document::AllocatorType& mem_allocator, bool* wrap_explicitly) {
    std::vector<JsonPath> parsed_paths;
    parse_json_paths(json_path, &parsed_paths);

    rapidjson::Value* matched = get_json_array_from_parsed_json(parsed_paths, document,
                                                                mem_allocator, wrap_explicitly);
    return matched;
}
155
156
// Resolves `parsed_paths` against `document` and returns the result as an array value.
//
//   * Path is just the root (one element): returns a new one-element array holding the
//     document; `*wrap_explicitly` stays false in this case.
//   * Match is already an array: returned as is, `*wrap_explicitly` stays false.
//   * Match is not an array and `wrap_explicitly` is non-null: returns a new one-element
//     array holding a deep copy of the match and sets `*wrap_explicitly` to true.
//   * Match is not an array and `wrap_explicitly` is null: the match is returned unwrapped.
//
// Returns nullptr when `document` is null, `parsed_paths` is empty, the first path element
// is invalid, or match_value() finds nothing (or hands back `document` itself).
//
// New arrays are allocated from `mem_allocator`.
// NOTE(review): in the root-only case the document is pushed with rapidjson's moving
// PushBack, so `*document` is presumably left null afterwards - confirm callers expect it.
rapidjson::Value* NO_SANITIZE_UNDEFINED JsonFunctions::get_json_array_from_parsed_json(
        const std::vector<JsonPath>& parsed_paths, rapidjson::Value* document,
        rapidjson::Document::AllocatorType& mem_allocator, bool* wrap_explicitly) {
    if (wrap_explicitly != nullptr) {
        *wrap_explicitly = false;
    }

    // An empty path vector (e.g. produced from an empty path string) has no element 0.
    if (document == nullptr || parsed_paths.empty() || !parsed_paths[0].is_valid) {
        return nullptr;
    }

    if (parsed_paths.size() == 1) {
        // the json path is "$", just return entire document
        // wrapped in an array
        auto* array_obj =
                static_cast<rapidjson::Value*>(mem_allocator.Malloc(sizeof(rapidjson::Value)));
        array_obj->SetArray();
        array_obj->PushBack(*document, mem_allocator);
        return array_obj;
    }

    rapidjson::Value* root = match_value(parsed_paths, document, mem_allocator, true);
    if (root == nullptr || root == document) { // not found
        return nullptr;
    }

    if (!root->IsArray() && wrap_explicitly != nullptr) {
        auto* array_obj =
                static_cast<rapidjson::Value*>(mem_allocator.Malloc(sizeof(rapidjson::Value)));
        array_obj->SetArray();
        rapidjson::Value copy;
        copy.CopyFrom(*root, mem_allocator);
        array_obj->PushBack(std::move(copy), mem_allocator);
        // set `wrap_explicitly` to true, so that the caller knows that this Array is wrapped actively.
        *wrap_explicitly = true;
        return array_obj;
    }
    return root;
}
190
191
// Resolves `parsed_paths` against `document` and returns the matched value.
//
// Returns `document` itself when the path consists of the root element only. Returns
// nullptr when `parsed_paths` is empty, its first element is invalid, or match_value()
// finds nothing (or hands back `document` itself for a multi-element path).
// `mem_allocator` is passed through to match_value(), which is called with
// is_insert_null == true.
rapidjson::Value* JsonFunctions::get_json_object_from_parsed_json(
        const std::vector<JsonPath>& parsed_paths, rapidjson::Value* document,
        rapidjson::Document::AllocatorType& mem_allocator) {
    // An empty path vector (e.g. produced from an empty path string) has no element 0.
    if (parsed_paths.empty() || !parsed_paths[0].is_valid) {
        return nullptr;
    }

    if (parsed_paths.size() == 1) {
        // the json path is "$", just return entire document
        return document;
    }

    rapidjson::Value* root = match_value(parsed_paths, document, mem_allocator, true);
    if (root == nullptr || root == document) { // not found
        return nullptr;
    }
    return root;
}
209
210
void JsonFunctions::parse_json_paths(const std::string& path_string,
211
18
                                     std::vector<JsonPath>* parsed_paths) {
212
    // split path by ".", and escape quota by "\"
213
    // eg:
214
    //    '$.text#abc.xyz'  ->  [$, text#abc, xyz]
215
    //    '$."text.abc".xyz'  ->  [$, text.abc, xyz]
216
    //    '$."text.abc"[1].xyz'  ->  [$, text.abc[1], xyz]
217
18
    try {
218
18
        boost::tokenizer<boost::escaped_list_separator<char>> tok(
219
18
                path_string, boost::escaped_list_separator<char>("\\", ".", "\""));
220
18
        std::vector<std::string> paths(tok.begin(), tok.end());
221
18
        get_parsed_paths(paths, parsed_paths);
222
18
    } catch (const boost::escaped_list_error& err) {
223
0
        throw doris::Exception(ErrorCode::INVALID_JSON_PATH, "meet error {}", err.what());
224
0
    }
225
18
}
226
227
void JsonFunctions::get_parsed_paths(const std::vector<std::string>& path_exprs,
228
18
                                     std::vector<JsonPath>* parsed_paths) {
229
18
    if (path_exprs.empty()) {
230
0
        return;
231
0
    }
232
233
18
    if (path_exprs[0] != "$") {
234
0
        parsed_paths->emplace_back("", -1, false);
235
18
    } else {
236
18
        parsed_paths->emplace_back("$", -1, true);
237
18
    }
238
239
44
    for (int i = 1; i < path_exprs.size(); i++) {
240
26
        std::string col;
241
26
        std::string index;
242
26
        if (UNLIKELY(!RE2::FullMatch(path_exprs[i], JSON_PATTERN, &col, &index))) {
243
0
            parsed_paths->emplace_back("", -1, false);
244
26
        } else {
245
26
            int idx = -1;
246
26
            if (!index.empty()) {
247
8
                if (index == "*") {
248
8
                    idx = -2;
249
8
                } else {
250
0
                    idx = atoi(index.c_str());
251
0
                }
252
8
            }
253
26
            parsed_paths->emplace_back(std::move(col), idx, true);
254
26
        }
255
26
    }
256
18
}
257
258
// Follows `jsonpath` inside the simdjson on-demand object `obj` and stores the value found
// into `*value`.
//
// `jsonpath[0]` is the root and is skipped, so a usable path has at least two elements.
// For each further element the key is looked up with find_field_unordered(); when the
// element carries a subscript (idx != -1) the looked-up value is additionally accessed as
// an array and the element at that subscript is taken.
//
// Returns:
//   * Status::OK() and writes `*value` on success;
//   * Status::DataQualityError when the path has fewer than two elements, a path element is
//     invalid, or simdjson reports any error other than the three listed below;
//   * Status::NotFound for simdjson NO_SUCH_FIELD, INDEX_OUT_OF_BOUNDS or INCORRECT_TYPE.
// `*value` is left untouched on every non-OK return.
Status JsonFunctions::extract_from_object(simdjson::ondemand::object& obj,
                                          const std::vector<JsonPath>& jsonpath,
                                          simdjson::ondemand::value* value) noexcept {
// Return DataQualityError when it's a malformed json.
// Otherwise the path was not found, due to
// 1. array out of bound
// 2. not exist such field in object
// 3. the input type is not object but could be null or other types and lead to simdjson::INCORRECT_TYPE
//
// `msg` is expanded only inside the error branch, so the message is not formatted on the
// success path.
#define HANDLE_SIMDJSON_ERROR(err, msg)                                                     \
    do {                                                                                    \
        const simdjson::error_code _err = (err);                                            \
        if (UNLIKELY(_err)) {                                                               \
            const std::string _msg = (msg);                                                 \
            if (_err == simdjson::NO_SUCH_FIELD || _err == simdjson::INDEX_OUT_OF_BOUNDS || \
                _err == simdjson::INCORRECT_TYPE) {                                         \
                return Status::NotFound<false>(                                             \
                        fmt::format("Not found target filed, err: {}, msg: {}",             \
                                    simdjson::error_message(_err), _msg));                  \
            }                                                                               \
            return Status::DataQualityError(                                                \
                    fmt::format("err: {}, msg: {}", simdjson::error_message(_err), _msg));  \
        }                                                                                   \
    } while (false)

    if (jsonpath.size() <= 1) {
        // The first elem of json path should be '$'.
        // A valid json path's size is >= 2.
        return Status::DataQualityError("empty json path");
    }

    simdjson::ondemand::value tvalue;

    // Skip the first $.
    for (size_t i = 1; i < jsonpath.size(); ++i) {
        if (UNLIKELY(!jsonpath[i].is_valid)) {
            return Status::DataQualityError(fmt::format("invalid json path: {}", jsonpath[i].key));
        }

        const std::string& col = jsonpath[i].key;
        const int index = jsonpath[i].idx;

        // Since the simdjson::ondemand::object cannot be converted to simdjson::ondemand::value,
        // we have to do some special treatment for the second elem of json path.
        // If the key is not found in json object, simdjson::NO_SUCH_FIELD would be returned.
        if (i == 1) {
            HANDLE_SIMDJSON_ERROR(obj.find_field_unordered(col).get(tvalue),
                                  fmt::format("unable to find field: {}", col));
        } else {
            HANDLE_SIMDJSON_ERROR(tvalue.find_field_unordered(col).get(tvalue),
                                  fmt::format("unable to find field: {}", col));
        }

        // TODO support [*] which index == -2
        if (index != -1) {
            // try to access tvalue as array.
            // If the index is beyond the length of array, simdjson::INDEX_OUT_OF_BOUNDS would be returned.
            simdjson::ondemand::array arr;
            HANDLE_SIMDJSON_ERROR(tvalue.get_array().get(arr),
                                  fmt::format("failed to access field as array, field: {}", col));

            HANDLE_SIMDJSON_ERROR(
                    arr.at(index).get(tvalue),
                    fmt::format("failed to access array field: {}, index: {}", col, index));
        }
    }

    std::swap(*value, tvalue);

    return Status::OK();
}
// The helper macro is private to extract_from_object(); do not leak it further.
#undef HANDLE_SIMDJSON_ERROR
328
329
0
// Serializes `value` to JSON text with a rapidjson::Writer and returns it as a std::string.
std::string JsonFunctions::print_json_value(const rapidjson::Value& value) {
    rapidjson::StringBuffer buffer; // freshly constructed, so no Clear() is needed
    rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
    value.Accept(writer);
    // Build from pointer + size so the text is not rescanned for its terminator.
    return std::string(buffer.GetString(), buffer.GetSize());
}
336
337
void JsonFunctions::merge_objects(rapidjson::Value& dst_object, rapidjson::Value& src_object,
338
0
                                  rapidjson::Document::AllocatorType& allocator) {
339
0
    if (!src_object.IsObject()) {
340
0
        return;
341
0
    }
342
0
    VLOG_DEBUG << "merge from src: " << print_json_value(src_object)
343
0
               << ", to: " << print_json_value(dst_object);
344
0
    for (auto src_it = src_object.MemberBegin(); src_it != src_object.MemberEnd(); ++src_it) {
345
0
        auto dst_it = dst_object.FindMember(src_it->name);
346
0
        if (dst_it != dst_object.MemberEnd()) {
347
0
            if (src_it->value.IsObject() && dst_it->value.IsObject()) {
348
0
                merge_objects(dst_it->value, src_it->value, allocator);
349
0
            } else {
350
0
                if (dst_it->value.IsNull()) {
351
0
                    dst_it->value = src_it->value;
352
0
                }
353
0
            }
354
0
        } else {
355
0
            dst_object.AddMember(src_it->name, src_it->value, allocator);
356
0
        }
357
0
    }
358
0
}
359
360
// root path "$."
361
0
bool JsonFunctions::is_root_path(const std::vector<JsonPath>& json_path) {
362
0
    return json_path.size() == 2 && json_path[0].key == "$" && json_path[1].key.empty();
363
0
}
364
365
} // namespace doris