Coverage Report

Created: 2026-03-25 18:05

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exec/es/es_scroll_parser.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "exec/es/es_scroll_parser.h"
19
20
#include <absl/strings/substitute.h>
21
#include <cctz/time_zone.h>
22
#include <glog/logging.h>
23
#include <rapidjson/allocators.h>
24
#include <rapidjson/encodings.h>
25
#include <stdint.h>
26
#include <string.h>
27
28
// IWYU pragma: no_include <bits/chrono.h>
29
#include <chrono> // IWYU pragma: keep
30
#include <cstdlib>
31
#include <ostream>
32
#include <string>
33
34
#include "common/status.h"
35
#include "core/binary_cast.hpp"
36
#include "core/column/column.h"
37
#include "core/column/column_nullable.h"
38
#include "core/data_type/data_type_array.h"
39
#include "core/data_type/data_type_nullable.h"
40
#include "core/data_type/define_primitive_type.h"
41
#include "core/data_type/primitive_type.h"
42
#include "core/field.h"
43
#include "core/value/decimalv2_value.h"
44
#include "core/value/jsonb_value.h"
45
#include "core/value/vdatetime_value.h"
46
#include "exprs/function/cast/cast_to_date_or_datetime_impl.hpp"
47
#include "exprs/function/cast/cast_to_datetimev2_impl.hpp"
48
#include "exprs/function/cast/cast_to_datev2_impl.hpp"
49
#include "rapidjson/document.h"
50
#include "rapidjson/rapidjson.h"
51
#include "rapidjson/stringbuffer.h"
52
#include "rapidjson/writer.h"
53
#include "runtime/descriptors.h"
54
#include "util/string_parser.hpp"
55
56
namespace doris {
57
#include "common/compile_check_begin.h"
58
59
// JSON member names looked up by this parser (presumably in the Elasticsearch scroll
// response, going by the file name). Declared constexpr so that neither the characters
// nor the pointers themselves can be modified at runtime.
static constexpr const char* FIELD_SCROLL_ID = "_scroll_id";
static constexpr const char* FIELD_HITS = "hits";
// Intentionally the same literal as FIELD_HITS; kept as a separate name.
static constexpr const char* FIELD_INNER_HITS = "hits";
static constexpr const char* FIELD_SOURCE = "_source";
static constexpr const char* FIELD_ID = "_id";
64
65
// get the original json data type
66
0
std::string json_type_to_string(rapidjson::Type type) {
67
0
    switch (type) {
68
0
    case rapidjson::kNumberType:
69
0
        return "Number";
70
0
    case rapidjson::kStringType:
71
0
        return "Varchar/Char";
72
0
    case rapidjson::kArrayType:
73
0
        return "Array";
74
0
    case rapidjson::kObjectType:
75
0
        return "Object";
76
0
    case rapidjson::kNullType:
77
0
        return "Null Type";
78
0
    case rapidjson::kFalseType:
79
0
    case rapidjson::kTrueType:
80
0
        return "True/False";
81
0
    default:
82
0
        return "Unknown Type";
83
0
    }
84
0
}
85
86
// transfer rapidjson::Value to string representation
87
0
std::string json_value_to_string(const rapidjson::Value& value) {
88
0
    rapidjson::StringBuffer scratch_buffer;
89
0
    rapidjson::Writer<rapidjson::StringBuffer> temp_writer(scratch_buffer);
90
0
    value.Accept(temp_writer);
91
0
    return scratch_buffer.GetString();
92
0
}
93
94
// Error message templates.
// ERROR_INVALID_COL_DATA uses a `{}` placeholder (it is passed to Status::InternalError as a
// format string in this file); the remaining templates use `$N` placeholders.
static const std::string ERROR_INVALID_COL_DATA =
        "Data source returned inconsistent column data. "
        "Expected value of type {} based on column metadata. This likely indicates a "
        "problem with the data source library.";
static const std::string ERROR_MEM_LIMIT_EXCEEDED =
        "DataSourceScanNode::$0() failed to allocate "
        "$1 bytes for $2.";
// Note the trailing space after `$0`: the two literals are concatenated, and without it the
// substituted type name would run straight into "based".
static const std::string ERROR_COL_DATA_IS_ARRAY =
        "Data source returned an array for the type $0 "
        "based on column metadata.";
static const std::string INVALID_NULL_VALUE =
        "Invalid null value occurs: Non-null column `$0` contains NULL";
106
107
// Returns Status::RuntimeError from the enclosing function when `col.IsArray()` equals
// `is_array`. The message names the expected Doris type, the JSON type actually found and
// the serialized JSON value.
// `col` is evaluated more than once, so pass a side-effect-free expression.
// Macro arguments are parenthesized so that compound expressions expand safely.
#define RETURN_ERROR_IF_COL_IS_ARRAY(col, type, is_array)                      \
    do {                                                                       \
        if ((col).IsArray() == (is_array)) {                                   \
            std::stringstream ss;                                              \
            ss << "Expected value of type: " << type_to_string(type)           \
               << "; but found type: " << json_type_to_string((col).GetType()) \
               << "; Document slice is : " << json_value_to_string(col);       \
            return Status::RuntimeError(ss.str());                             \
        }                                                                      \
    } while (false)
117
118
// Returns Status::RuntimeError from the enclosing function when `col` is not a JSON string.
// The message names the expected Doris type, the JSON type actually found and the
// serialized JSON value.
// `col` is evaluated more than once, so pass a side-effect-free expression.
// Macro arguments are parenthesized so that compound expressions expand safely.
#define RETURN_ERROR_IF_COL_IS_NOT_STRING(col, type)                            \
    do {                                                                        \
        if (!(col).IsString()) {                                                \
            std::stringstream ss;                                               \
            ss << "Expected value of type: " << type_to_string(type)            \
               << "; but found type: " << json_type_to_string((col).GetType())  \
               << "; Document source slice is : " << json_value_to_string(col); \
            return Status::RuntimeError(ss.str());                              \
        }                                                                       \
    } while (false)
128
129
// Returns Status::RuntimeError from the enclosing function when `col` is not a JSON number.
// The message names the expected Doris type, the JSON type actually found and the
// serialized JSON value.
// `col` is evaluated more than once, so pass a side-effect-free expression.
// Macro arguments are parenthesized so that compound expressions expand safely.
#define RETURN_ERROR_IF_COL_IS_NOT_NUMBER(col, type)                           \
    do {                                                                       \
        if (!(col).IsNumber()) {                                               \
            std::stringstream ss;                                              \
            ss << "Expected value of type: " << type_to_string(type)           \
               << "; but found type: " << json_type_to_string((col).GetType()) \
               << "; Document value is: " << json_value_to_string(col);        \
            return Status::RuntimeError(ss.str());                             \
        }                                                                      \
    } while (false)
139
140
// Returns Status::RuntimeError from the enclosing function when `result` is anything other
// than StringParser::PARSE_SUCCESS. The message names the expected Doris type, the JSON
// type of `col` and the serialized JSON value that failed to parse.
// `col` is evaluated more than once, so pass a side-effect-free expression.
// Macro arguments are parenthesized so that compound expressions expand safely.
#define RETURN_ERROR_IF_PARSING_FAILED(result, col, type)                       \
    do {                                                                        \
        if ((result) != StringParser::PARSE_SUCCESS) {                          \
            std::stringstream ss;                                               \
            ss << "Expected value of type: " << type_to_string(type)            \
               << "; but found type: " << json_type_to_string((col).GetType())  \
               << "; Document source slice is : " << json_value_to_string(col); \
            return Status::RuntimeError(ss.str());                              \
        }                                                                       \
    } while (false)
150
151
// Unconditionally returns Status::RuntimeError from the enclosing function; despite the
// "IF" in the name there is no condition inside the macro, so the caller must guard it.
// The message names the expected Doris type, the JSON type of `col` and the serialized
// JSON value.
// `col` is evaluated more than once, so pass a side-effect-free expression.
// Macro arguments are parenthesized so that compound expressions expand safely.
#define RETURN_ERROR_IF_CAST_FORMAT_ERROR(col, type)                       \
    do {                                                                   \
        std::stringstream ss;                                              \
        ss << "Expected value of type: " << type_to_string(type)           \
           << "; but found type: " << json_type_to_string((col).GetType()) \
           << "; Document slice is : " << json_value_to_string(col);       \
        return Status::RuntimeError(ss.str());                             \
    } while (false)
159
160
template <typename T>
161
Status get_int_value(const rapidjson::Value& col, PrimitiveType type, void* slot,
162
0
                     bool pure_doc_value) {
163
0
    if (col.IsNumber()) {
164
0
        *reinterpret_cast<T*>(slot) = (T)(sizeof(T) < 8 ? col.GetInt() : col.GetInt64());
165
0
        return Status::OK();
166
0
    }
167
168
0
    if (pure_doc_value && col.IsArray() && !col.Empty()) {
169
0
        RETURN_ERROR_IF_COL_IS_NOT_NUMBER(col[0], type);
170
0
        *reinterpret_cast<T*>(slot) = (T)(sizeof(T) < 8 ? col[0].GetInt() : col[0].GetInt64());
171
0
        return Status::OK();
172
0
    }
173
174
0
    RETURN_ERROR_IF_COL_IS_ARRAY(col, type, true);
175
0
    RETURN_ERROR_IF_COL_IS_NOT_STRING(col, type);
176
177
0
    StringParser::ParseResult result;
178
0
    const std::string& val = col.GetString();
179
0
    size_t len = col.GetStringLength();
180
0
    T v = StringParser::string_to_int<T>(val.c_str(), len, &result);
181
0
    RETURN_ERROR_IF_PARSING_FAILED(result, col, type);
182
183
0
    if (sizeof(T) < 16) {
184
0
        *reinterpret_cast<T*>(slot) = v;
185
0
    } else {
186
0
        DCHECK(sizeof(T) == 16);
187
0
        memcpy(slot, &v, sizeof(v));
188
0
    }
189
190
0
    return Status::OK();
191
0
}
Unexecuted instantiation: _ZN5doris13get_int_valueIaEENS_6StatusERKN9rapidjson12GenericValueINS2_4UTF8IcEENS2_19MemoryPoolAllocatorINS2_12CrtAllocatorEEEEENS_13PrimitiveTypeEPvb
Unexecuted instantiation: _ZN5doris13get_int_valueIsEENS_6StatusERKN9rapidjson12GenericValueINS2_4UTF8IcEENS2_19MemoryPoolAllocatorINS2_12CrtAllocatorEEEEENS_13PrimitiveTypeEPvb
Unexecuted instantiation: _ZN5doris13get_int_valueIiEENS_6StatusERKN9rapidjson12GenericValueINS2_4UTF8IcEENS2_19MemoryPoolAllocatorINS2_12CrtAllocatorEEEEENS_13PrimitiveTypeEPvb
Unexecuted instantiation: _ZN5doris13get_int_valueIlEENS_6StatusERKN9rapidjson12GenericValueINS2_4UTF8IcEENS2_19MemoryPoolAllocatorINS2_12CrtAllocatorEEEEENS_13PrimitiveTypeEPvb
Unexecuted instantiation: _ZN5doris13get_int_valueInEENS_6StatusERKN9rapidjson12GenericValueINS2_4UTF8IcEENS2_19MemoryPoolAllocatorINS2_12CrtAllocatorEEEEENS_13PrimitiveTypeEPvb
192
193
template <PrimitiveType T>
194
Status get_date_value_int(const rapidjson::Value& col, PrimitiveType type, bool is_date_str,
195
                          typename PrimitiveTypeTraits<T>::CppType* slot,
196
0
                          const cctz::time_zone& time_zone) {
197
0
    constexpr bool is_datetime_v1 = T == TYPE_DATE || T == TYPE_DATETIME;
198
0
    typename PrimitiveTypeTraits<T>::CppType dt_val;
199
0
    if (is_date_str) {
200
0
        const std::string str_date = col.GetString();
201
0
        int str_length = col.GetStringLength();
202
0
        bool success = false;
203
0
        if (str_length > 19) {
204
0
            std::chrono::system_clock::time_point tp;
205
            // time_zone suffix pattern
206
            // Z/+08:00/-04:30
207
0
            RE2 time_zone_pattern(R"([+-]\d{2}:?\d{2}|Z)");
208
0
            bool ok = false;
209
0
            std::string fmt;
210
0
            re2::StringPiece value;
211
0
            if (time_zone_pattern.Match(str_date, 0, str_date.size(), RE2::UNANCHORED, &value, 1)) {
212
                // with time_zone info
213
                // YYYY-MM-DDTHH:MM:SSZ or YYYY-MM-DDTHH:MM:SS+08:00
214
                // or 2022-08-08T12:10:10.000Z or YYYY-MM-DDTHH:MM:SS-08:00
215
0
                fmt = "%Y-%m-%dT%H:%M:%E*S%Ez";
216
0
                cctz::time_zone ctz;
217
                // find time_zone by time_zone suffix string
218
0
                TimezoneUtils::find_cctz_time_zone(value.as_string(), ctz);
219
0
                ok = cctz::parse(fmt, str_date, ctz, &tp);
220
0
            } else {
221
                // without time_zone info
222
                // 2022-08-08T12:10:10.000
223
0
                fmt = "%Y-%m-%dT%H:%M:%E*S";
224
                // If the time without time_zone info, ES will assume it is UTC time.
225
                // So we parse it in Doris with UTC time zone.
226
0
                ok = cctz::parse(fmt, str_date, cctz::utc_time_zone(), &tp);
227
0
            }
228
0
            if (ok) {
229
                // The local time zone can change by session variable `time_zone`
230
                // We should use the user specified time zone, not the actual system local time zone.
231
0
                success = true;
232
0
                dt_val.from_unixtime(std::chrono::system_clock::to_time_t(tp), time_zone);
233
0
            }
234
0
        } else if (str_length == 19) {
235
            // YYYY-MM-DDTHH:MM:SS
236
0
            if (*(str_date.c_str() + 10) == 'T') {
237
0
                std::chrono::system_clock::time_point tp;
238
0
                const bool ok =
239
0
                        cctz::parse("%Y-%m-%dT%H:%M:%S", str_date, cctz::utc_time_zone(), &tp);
240
0
                if (ok) {
241
0
                    success = true;
242
0
                    dt_val.from_unixtime(std::chrono::system_clock::to_time_t(tp), time_zone);
243
0
                }
244
0
            } else {
245
                // YYYY-MM-DD HH:MM:SS
246
0
                CastParameters params;
247
0
                if constexpr (is_datetime_v1) {
248
0
                    success = CastToDateOrDatetime::from_string_non_strict_mode<
249
0
                            DatelikeTargetType::DATE_TIME>({str_date.c_str(), (size_t)str_length},
250
0
                                                           dt_val, nullptr, params);
251
0
                } else if constexpr (T == TYPE_DATEV2) {
252
0
                    success = CastToDateV2::from_string_non_strict_mode(
253
0
                            {str_date.c_str(), (size_t)str_length}, dt_val, nullptr, params);
254
0
                } else {
255
0
                    success = CastToDatetimeV2::from_string_non_strict_mode(
256
0
                            {str_date.c_str(), (size_t)str_length}, dt_val, nullptr, -1, params);
257
0
                }
258
0
            }
259
260
0
        } else if (str_length == 13) {
261
            // string long like "1677895728000"
262
0
            int64_t time_long = std::atol(str_date.c_str());
263
0
            if (time_long > 0) {
264
0
                success = true;
265
0
                dt_val.from_unixtime(time_long / 1000, time_zone);
266
0
            }
267
0
        } else {
268
            // YYYY-MM-DD or others
269
0
            CastParameters params;
270
0
            if constexpr (is_datetime_v1) {
271
0
                success = CastToDateOrDatetime::from_string_non_strict_mode<
272
0
                        DatelikeTargetType::DATE_TIME>({str_date.c_str(), (size_t)str_length},
273
0
                                                       dt_val, nullptr, params);
274
0
            } else if constexpr (T == TYPE_DATEV2) {
275
0
                success = CastToDateV2::from_string_non_strict_mode(
276
0
                        {str_date.c_str(), (size_t)str_length}, dt_val, nullptr, params);
277
0
            } else {
278
0
                success = CastToDatetimeV2::from_string_non_strict_mode(
279
0
                        {str_date.c_str(), (size_t)str_length}, dt_val, nullptr, -1, params);
280
0
            }
281
0
        }
282
283
0
        if (!success) {
284
0
            RETURN_ERROR_IF_CAST_FORMAT_ERROR(col, type);
285
0
        }
286
287
0
    } else {
288
0
        dt_val.from_unixtime(col.GetInt64() / 1000, time_zone);
289
0
    }
290
0
    if constexpr (is_datetime_v1) {
291
0
        if (type == TYPE_DATE) {
292
0
            dt_val.cast_to_date();
293
0
        } else {
294
0
            dt_val.to_datetime();
295
0
        }
296
0
    }
297
298
0
    *slot = *reinterpret_cast<typename PrimitiveTypeTraits<T>::CppType*>(&dt_val);
299
0
    return Status::OK();
300
0
}
Unexecuted instantiation: _ZN5doris18get_date_value_intILNS_13PrimitiveTypeE25EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bPNS_19PrimitiveTypeTraitsIXT_EE7CppTypeERKN4cctz9time_zoneE
Unexecuted instantiation: _ZN5doris18get_date_value_intILNS_13PrimitiveTypeE26EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bPNS_19PrimitiveTypeTraitsIXT_EE7CppTypeERKN4cctz9time_zoneE
Unexecuted instantiation: _ZN5doris18get_date_value_intILNS_13PrimitiveTypeE11EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bPNS_19PrimitiveTypeTraitsIXT_EE7CppTypeERKN4cctz9time_zoneE
Unexecuted instantiation: _ZN5doris18get_date_value_intILNS_13PrimitiveTypeE12EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bPNS_19PrimitiveTypeTraitsIXT_EE7CppTypeERKN4cctz9time_zoneE
301
302
template <PrimitiveType T>
303
Status get_date_int(const rapidjson::Value& col, PrimitiveType type, bool pure_doc_value,
304
                    typename PrimitiveTypeTraits<T>::CppType* slot,
305
0
                    const cctz::time_zone& time_zone) {
306
    // this would happend just only when `enable_docvalue_scan = false`, and field has timestamp format date from _source
307
0
    if (col.IsNumber()) {
308
        // ES process date/datetime field would use millisecond timestamp for index or docvalue
309
        // processing date type field, if a number is encountered, Doris On ES will force it to be processed according to ms
310
        // Doris On ES needs to be consistent with ES, so just divided by 1000 because the unit for from_unixtime is seconds
311
0
        return get_date_value_int<T>(col, type, false, slot, time_zone);
312
0
    } else if (col.IsArray() && pure_doc_value && !col.Empty()) {
313
        // this would happened just only when `enable_docvalue_scan = true`
314
        // ES add default format for all field after ES 6.4, if we not provided format for `date` field ES would impose
315
        // a standard date-format for date field as `2020-06-16T00:00:00.000Z`
316
        // At present, we just process this string format date. After some PR were merged into Doris, we would impose `epoch_mills` for
317
        // date field's docvalue
318
0
        if (col[0].IsString()) {
319
0
            return get_date_value_int<T>(col[0], type, true, slot, time_zone);
320
0
        }
321
        // ES would return millisecond timestamp for date field, divided by 1000 because the unit for from_unixtime is seconds
322
0
        return get_date_value_int<T>(col[0], type, false, slot, time_zone);
323
0
    } else {
324
        // this would happened just only when `enable_docvalue_scan = false`, and field has string format date from _source
325
0
        RETURN_ERROR_IF_COL_IS_ARRAY(col, type, true);
326
0
        RETURN_ERROR_IF_COL_IS_NOT_STRING(col, type);
327
0
        return get_date_value_int<T>(col, type, true, slot, time_zone);
328
0
    }
329
0
}
Unexecuted instantiation: _ZN5doris12get_date_intILNS_13PrimitiveTypeE25EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bPNS_19PrimitiveTypeTraitsIXT_EE7CppTypeERKN4cctz9time_zoneE
Unexecuted instantiation: _ZN5doris12get_date_intILNS_13PrimitiveTypeE26EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bPNS_19PrimitiveTypeTraitsIXT_EE7CppTypeERKN4cctz9time_zoneE
Unexecuted instantiation: _ZN5doris12get_date_intILNS_13PrimitiveTypeE11EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bPNS_19PrimitiveTypeTraitsIXT_EE7CppTypeERKN4cctz9time_zoneE
Unexecuted instantiation: _ZN5doris12get_date_intILNS_13PrimitiveTypeE12EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bPNS_19PrimitiveTypeTraitsIXT_EE7CppTypeERKN4cctz9time_zoneE
330
template <PrimitiveType T>
331
Status fill_date_int(const rapidjson::Value& col, PrimitiveType type, bool pure_doc_value,
332
0
                     IColumn* col_ptr, const cctz::time_zone& time_zone) {
333
0
    typename PrimitiveTypeTraits<T>::CppType data;
334
0
    RETURN_IF_ERROR((get_date_int<T>(col, type, pure_doc_value, &data, time_zone)));
335
0
    col_ptr->insert_data(const_cast<const char*>(reinterpret_cast<char*>(&data)), 0);
336
0
    return Status::OK();
337
0
}
Unexecuted instantiation: _ZN5doris13fill_date_intILNS_13PrimitiveTypeE11EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bPNS_7IColumnERKN4cctz9time_zoneE
Unexecuted instantiation: _ZN5doris13fill_date_intILNS_13PrimitiveTypeE12EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bPNS_7IColumnERKN4cctz9time_zoneE
Unexecuted instantiation: _ZN5doris13fill_date_intILNS_13PrimitiveTypeE25EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bPNS_7IColumnERKN4cctz9time_zoneE
Unexecuted instantiation: _ZN5doris13fill_date_intILNS_13PrimitiveTypeE26EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bPNS_7IColumnERKN4cctz9time_zoneE
338
339
template <typename T>
340
Status get_float_value(const rapidjson::Value& col, PrimitiveType type, void* slot,
341
0
                       bool pure_doc_value) {
342
0
    static_assert(sizeof(T) == 4 || sizeof(T) == 8);
343
0
    if (col.IsNumber()) {
344
0
        *reinterpret_cast<T*>(slot) = (T)(sizeof(T) == 4 ? col.GetFloat() : col.GetDouble());
345
0
        return Status::OK();
346
0
    }
347
348
0
    if (pure_doc_value && col.IsArray() && !col.Empty()) {
349
0
        *reinterpret_cast<T*>(slot) = (T)(sizeof(T) == 4 ? col[0].GetFloat() : col[0].GetDouble());
350
0
        return Status::OK();
351
0
    }
352
353
0
    RETURN_ERROR_IF_COL_IS_ARRAY(col, type, true);
354
0
    RETURN_ERROR_IF_COL_IS_NOT_STRING(col, type);
355
356
0
    StringParser::ParseResult result;
357
0
    const std::string& val = col.GetString();
358
0
    size_t len = col.GetStringLength();
359
0
    T v = StringParser::string_to_float<T>(val.c_str(), len, &result);
360
0
    RETURN_ERROR_IF_PARSING_FAILED(result, col, type);
361
0
    *reinterpret_cast<T*>(slot) = v;
362
363
0
    return Status::OK();
364
0
}
Unexecuted instantiation: _ZN5doris15get_float_valueIfEENS_6StatusERKN9rapidjson12GenericValueINS2_4UTF8IcEENS2_19MemoryPoolAllocatorINS2_12CrtAllocatorEEEEENS_13PrimitiveTypeEPvb
Unexecuted instantiation: _ZN5doris15get_float_valueIdEENS_6StatusERKN9rapidjson12GenericValueINS2_4UTF8IcEENS2_19MemoryPoolAllocatorINS2_12CrtAllocatorEEEEENS_13PrimitiveTypeEPvb
365
366
template <typename T>
367
Status insert_float_value(const rapidjson::Value& col, PrimitiveType type, IColumn* col_ptr,
368
0
                          bool pure_doc_value, bool nullable) {
369
0
    static_assert(sizeof(T) == 4 || sizeof(T) == 8);
370
0
    if (col.IsNumber() && nullable) {
371
0
        T value = (T)(sizeof(T) == 4 ? col.GetFloat() : col.GetDouble());
372
0
        col_ptr->insert_data(const_cast<const char*>(reinterpret_cast<char*>(&value)), 0);
373
0
        return Status::OK();
374
0
    }
375
376
0
    if (pure_doc_value && col.IsArray() && !col.Empty() && nullable) {
377
0
        T value = (T)(sizeof(T) == 4 ? col[0].GetFloat() : col[0].GetDouble());
378
0
        col_ptr->insert_data(const_cast<const char*>(reinterpret_cast<char*>(&value)), 0);
379
0
        return Status::OK();
380
0
    }
381
382
0
    RETURN_ERROR_IF_COL_IS_ARRAY(col, type, true);
383
0
    RETURN_ERROR_IF_COL_IS_NOT_STRING(col, type);
384
385
0
    StringParser::ParseResult result;
386
0
    const std::string& val = col.GetString();
387
0
    size_t len = col.GetStringLength();
388
0
    T v = StringParser::string_to_float<T>(val.c_str(), len, &result);
389
0
    RETURN_ERROR_IF_PARSING_FAILED(result, col, type);
390
391
0
    col_ptr->insert_data(const_cast<const char*>(reinterpret_cast<char*>(&v)), 0);
392
393
0
    return Status::OK();
394
0
}
Unexecuted instantiation: _ZN5doris18insert_float_valueIdEENS_6StatusERKN9rapidjson12GenericValueINS2_4UTF8IcEENS2_19MemoryPoolAllocatorINS2_12CrtAllocatorEEEEENS_13PrimitiveTypeEPNS_7IColumnEbb
Unexecuted instantiation: _ZN5doris18insert_float_valueIfEENS_6StatusERKN9rapidjson12GenericValueINS2_4UTF8IcEENS2_19MemoryPoolAllocatorINS2_12CrtAllocatorEEEEENS_13PrimitiveTypeEPNS_7IColumnEbb
395
396
template <typename T>
397
Status insert_int_value(const rapidjson::Value& col, PrimitiveType type, IColumn* col_ptr,
398
0
                        bool pure_doc_value, bool nullable) {
399
0
    if (col.IsNumber()) {
400
0
        T value;
401
        // ES allows inserting float and double in int/long types.
402
        // To parse these numbers in Doris, we direct cast them to int types.
403
0
        if (col.IsDouble()) {
404
0
            value = static_cast<T>(col.GetDouble());
405
0
        } else if (col.IsFloat()) {
406
0
            value = static_cast<T>(col.GetFloat());
407
0
        } else {
408
0
            value = (T)(sizeof(T) < 8 ? col.GetInt() : col.GetInt64());
409
0
        }
410
0
        col_ptr->insert_data(const_cast<const char*>(reinterpret_cast<char*>(&value)), 0);
411
0
        return Status::OK();
412
0
    }
413
414
0
    auto parse_and_insert_data = [&](const rapidjson::Value& col_value) -> Status {
415
0
        StringParser::ParseResult result;
416
0
        std::string val = col_value.GetString();
417
        // ES allows inserting numbers and characters containing decimals in numeric types.
418
        // To parse these numbers in Doris, we remove the decimals here.
419
0
        size_t pos = val.find('.');
420
0
        if (pos != std::string::npos) {
421
0
            val = val.substr(0, pos);
422
0
        }
423
0
        size_t len = val.length();
424
0
        T v = StringParser::string_to_int<T>(val.c_str(), len, &result);
425
0
        RETURN_ERROR_IF_PARSING_FAILED(result, col_value, type);
426
427
0
        col_ptr->insert_data(const_cast<const char*>(reinterpret_cast<char*>(&v)), 0);
428
0
        return Status::OK();
429
0
    };
Unexecuted instantiation: _ZZN5doris16insert_int_valueIaEENS_6StatusERKN9rapidjson12GenericValueINS2_4UTF8IcEENS2_19MemoryPoolAllocatorINS2_12CrtAllocatorEEEEENS_13PrimitiveTypeEPNS_7IColumnEbbENKUlSB_E_clESB_
Unexecuted instantiation: _ZZN5doris16insert_int_valueIsEENS_6StatusERKN9rapidjson12GenericValueINS2_4UTF8IcEENS2_19MemoryPoolAllocatorINS2_12CrtAllocatorEEEEENS_13PrimitiveTypeEPNS_7IColumnEbbENKUlSB_E_clESB_
Unexecuted instantiation: _ZZN5doris16insert_int_valueIiEENS_6StatusERKN9rapidjson12GenericValueINS2_4UTF8IcEENS2_19MemoryPoolAllocatorINS2_12CrtAllocatorEEEEENS_13PrimitiveTypeEPNS_7IColumnEbbENKUlSB_E_clESB_
Unexecuted instantiation: _ZZN5doris16insert_int_valueIlEENS_6StatusERKN9rapidjson12GenericValueINS2_4UTF8IcEENS2_19MemoryPoolAllocatorINS2_12CrtAllocatorEEEEENS_13PrimitiveTypeEPNS_7IColumnEbbENKUlSB_E_clESB_
Unexecuted instantiation: _ZZN5doris16insert_int_valueInEENS_6StatusERKN9rapidjson12GenericValueINS2_4UTF8IcEENS2_19MemoryPoolAllocatorINS2_12CrtAllocatorEEEEENS_13PrimitiveTypeEPNS_7IColumnEbbENKUlSB_E_clESB_
430
431
0
    if (pure_doc_value && col.IsArray() && !col.Empty()) {
432
0
        if (col[0].IsNumber()) {
433
0
            T value = (T)(sizeof(T) < 8 ? col[0].GetInt() : col[0].GetInt64());
434
0
            col_ptr->insert_data(const_cast<const char*>(reinterpret_cast<char*>(&value)), 0);
435
0
            return Status::OK();
436
0
        } else {
437
0
            RETURN_ERROR_IF_COL_IS_ARRAY(col[0], type, true);
438
0
            RETURN_ERROR_IF_COL_IS_NOT_STRING(col[0], type);
439
0
            return parse_and_insert_data(col[0]);
440
0
        }
441
0
    }
442
443
0
    RETURN_ERROR_IF_COL_IS_ARRAY(col, type, true);
444
0
    RETURN_ERROR_IF_COL_IS_NOT_STRING(col, type);
445
0
    return parse_and_insert_data(col);
446
0
}
Unexecuted instantiation: _ZN5doris16insert_int_valueIaEENS_6StatusERKN9rapidjson12GenericValueINS2_4UTF8IcEENS2_19MemoryPoolAllocatorINS2_12CrtAllocatorEEEEENS_13PrimitiveTypeEPNS_7IColumnEbb
Unexecuted instantiation: _ZN5doris16insert_int_valueIsEENS_6StatusERKN9rapidjson12GenericValueINS2_4UTF8IcEENS2_19MemoryPoolAllocatorINS2_12CrtAllocatorEEEEENS_13PrimitiveTypeEPNS_7IColumnEbb
Unexecuted instantiation: _ZN5doris16insert_int_valueIiEENS_6StatusERKN9rapidjson12GenericValueINS2_4UTF8IcEENS2_19MemoryPoolAllocatorINS2_12CrtAllocatorEEEEENS_13PrimitiveTypeEPNS_7IColumnEbb
Unexecuted instantiation: _ZN5doris16insert_int_valueIlEENS_6StatusERKN9rapidjson12GenericValueINS2_4UTF8IcEENS2_19MemoryPoolAllocatorINS2_12CrtAllocatorEEEEENS_13PrimitiveTypeEPNS_7IColumnEbb
Unexecuted instantiation: _ZN5doris16insert_int_valueInEENS_6StatusERKN9rapidjson12GenericValueINS2_4UTF8IcEENS2_19MemoryPoolAllocatorINS2_12CrtAllocatorEEEEENS_13PrimitiveTypeEPNS_7IColumnEbb
447
448
template <PrimitiveType T>
449
Status handle_value(const rapidjson::Value& col, PrimitiveType sub_type, bool pure_doc_value,
450
0
                    typename PrimitiveTypeTraits<T>::CppType& val) {
451
    if constexpr (T == TYPE_TINYINT || T == TYPE_SMALLINT || T == TYPE_INT || T == TYPE_BIGINT ||
452
0
                  T == TYPE_LARGEINT) {
453
0
        RETURN_IF_ERROR(get_int_value<typename PrimitiveTypeTraits<T>::CppType>(col, sub_type, &val,
454
0
                                                                                pure_doc_value));
455
0
        return Status::OK();
456
0
    }
457
0
    if constexpr (T == TYPE_FLOAT) {
458
0
        RETURN_IF_ERROR(get_float_value<float>(col, sub_type, &val, pure_doc_value));
459
0
        return Status::OK();
460
0
    }
461
0
    if constexpr (T == TYPE_DOUBLE) {
462
0
        RETURN_IF_ERROR(get_float_value<double>(col, sub_type, &val, pure_doc_value));
463
0
        return Status::OK();
464
0
    }
465
0
    if constexpr (T == TYPE_STRING || T == TYPE_CHAR || T == TYPE_VARCHAR) {
466
        // When ES mapping is keyword/text but actual data is an array,
467
        // serialize the array to JSON string instead of throwing an error.
468
        // This is valid in ES since any field can hold array values.
469
0
        if (col.IsArray()) {
470
0
            val = json_value_to_string(col);
471
0
        } else if (!col.IsString()) {
472
0
            val = json_value_to_string(col);
473
0
        } else {
474
0
            val = col.GetString();
475
0
        }
476
0
        return Status::OK();
477
0
    }
478
0
    if constexpr (T == TYPE_BOOLEAN) {
479
0
        if (col.IsBool()) {
480
0
            val = col.GetBool();
481
0
            return Status::OK();
482
0
        }
483
484
0
        if (col.IsNumber()) {
485
0
            val = static_cast<typename PrimitiveTypeTraits<T>::CppType>(col.GetInt());
486
0
            return Status::OK();
487
0
        }
488
489
0
        bool is_nested_str = false;
490
0
        if (pure_doc_value && col.IsArray() && !col.Empty() && col[0].IsBool()) {
491
0
            val = col[0].GetBool();
492
0
            return Status::OK();
493
0
        } else if (pure_doc_value && col.IsArray() && !col.Empty() && col[0].IsString()) {
494
0
            is_nested_str = true;
495
0
        } else if (pure_doc_value && col.IsArray()) {
496
0
            return Status::InternalError(ERROR_INVALID_COL_DATA, "BOOLEAN");
497
0
        }
498
499
0
        const rapidjson::Value& str_col = is_nested_str ? col[0] : col;
500
0
        const std::string& str_val = str_col.GetString();
501
0
        size_t val_size = str_col.GetStringLength();
502
0
        StringParser::ParseResult result;
503
0
        val = StringParser::string_to_bool(str_val.c_str(), val_size, &result);
504
0
        RETURN_ERROR_IF_PARSING_FAILED(result, str_col, sub_type);
505
0
        return Status::OK();
506
0
    }
507
0
    throw Exception(ErrorCode::INTERNAL_ERROR, "Un-supported type: {}", type_to_string(T));
508
0
}
Unexecuted instantiation: _ZN5doris12handle_valueILNS_13PrimitiveTypeE23EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12handle_valueILNS_13PrimitiveTypeE3EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12handle_valueILNS_13PrimitiveTypeE4EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12handle_valueILNS_13PrimitiveTypeE5EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12handle_valueILNS_13PrimitiveTypeE6EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12handle_valueILNS_13PrimitiveTypeE7EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12handle_valueILNS_13PrimitiveTypeE8EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12handle_valueILNS_13PrimitiveTypeE9EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12handle_valueILNS_13PrimitiveTypeE2EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
509
510
template <PrimitiveType T>
511
Status process_single_column(const rapidjson::Value& col, PrimitiveType sub_type,
512
0
                             bool pure_doc_value, Array& array) {
513
0
    typename PrimitiveTypeTraits<T>::CppType val;
514
0
    RETURN_IF_ERROR(handle_value<T>(col, sub_type, pure_doc_value, val));
515
0
    array.push_back(Field::create_field<T>(val));
516
0
    return Status::OK();
517
0
}
Unexecuted instantiation: _ZN5doris21process_single_columnILNS_13PrimitiveTypeE23EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris21process_single_columnILNS_13PrimitiveTypeE3EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris21process_single_columnILNS_13PrimitiveTypeE4EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris21process_single_columnILNS_13PrimitiveTypeE5EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris21process_single_columnILNS_13PrimitiveTypeE6EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris21process_single_columnILNS_13PrimitiveTypeE7EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris21process_single_columnILNS_13PrimitiveTypeE8EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris21process_single_columnILNS_13PrimitiveTypeE9EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris21process_single_columnILNS_13PrimitiveTypeE2EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
518
519
template <PrimitiveType T>
520
Status process_column_array(const rapidjson::Value& col, PrimitiveType sub_type,
521
0
                            bool pure_doc_value, Array& array) {
522
0
    for (const auto& sub_col : col.GetArray()) {
523
0
        RETURN_IF_ERROR(process_single_column<T>(sub_col, sub_type, pure_doc_value, array));
524
0
    }
525
0
    return Status::OK();
526
0
}
Unexecuted instantiation: _ZN5doris20process_column_arrayILNS_13PrimitiveTypeE23EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris20process_column_arrayILNS_13PrimitiveTypeE3EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris20process_column_arrayILNS_13PrimitiveTypeE4EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris20process_column_arrayILNS_13PrimitiveTypeE5EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris20process_column_arrayILNS_13PrimitiveTypeE6EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris20process_column_arrayILNS_13PrimitiveTypeE7EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris20process_column_arrayILNS_13PrimitiveTypeE8EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris20process_column_arrayILNS_13PrimitiveTypeE9EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris20process_column_arrayILNS_13PrimitiveTypeE2EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
527
528
template <PrimitiveType T>
529
Status process_column(const rapidjson::Value& col, PrimitiveType sub_type, bool pure_doc_value,
530
0
                      Array& array) {
531
0
    if (!col.IsArray()) {
532
0
        return process_single_column<T>(col, sub_type, pure_doc_value, array);
533
0
    } else {
534
0
        return process_column_array<T>(col, sub_type, pure_doc_value, array);
535
0
    }
536
0
}
Unexecuted instantiation: _ZN5doris14process_columnILNS_13PrimitiveTypeE23EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris14process_columnILNS_13PrimitiveTypeE3EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris14process_columnILNS_13PrimitiveTypeE4EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris14process_columnILNS_13PrimitiveTypeE5EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris14process_columnILNS_13PrimitiveTypeE6EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris14process_columnILNS_13PrimitiveTypeE7EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris14process_columnILNS_13PrimitiveTypeE8EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris14process_columnILNS_13PrimitiveTypeE9EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris14process_columnILNS_13PrimitiveTypeE2EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
537
538
template <PrimitiveType T>
539
Status process_date_column(const rapidjson::Value& col, PrimitiveType sub_type, bool pure_doc_value,
540
0
                           Array& array, const cctz::time_zone& time_zone) {
541
0
    if (!col.IsArray()) {
542
0
        typename PrimitiveTypeTraits<T>::CppType data;
543
0
        RETURN_IF_ERROR((get_date_int<T>(col, sub_type, pure_doc_value, &data, time_zone)));
544
0
        array.push_back(Field::create_field<T>(data));
545
0
    } else {
546
0
        for (const auto& sub_col : col.GetArray()) {
547
0
            typename PrimitiveTypeTraits<T>::CppType data;
548
0
            RETURN_IF_ERROR((get_date_int<T>(sub_col, sub_type, pure_doc_value, &data, time_zone)));
549
0
            array.push_back(Field::create_field<T>(data));
550
0
        }
551
0
    }
552
0
    return Status::OK();
553
0
}
Unexecuted instantiation: _ZN5doris19process_date_columnILNS_13PrimitiveTypeE25EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayERKN4cctz9time_zoneE
Unexecuted instantiation: _ZN5doris19process_date_columnILNS_13PrimitiveTypeE26EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayERKN4cctz9time_zoneE
554
555
// Serializes a JSON value (or every element of a JSON array) back to text,
// re-encodes it as JSONB and appends the resulting JsonbField(s) to `array`.
//
// `sub_type` and `pure_doc_value` are not used here; they are kept so the
// signature stays parallel to the other process_* helpers.
//
// Returns the first non-OK status from JsonBinaryValue::from_json_string;
// values appended before the failure stay in `array`.
Status process_jsonb_column(const rapidjson::Value& col, PrimitiveType sub_type,
                            bool pure_doc_value, Array& array) {
    // Single conversion path shared by the scalar and array cases, so both
    // move the freshly built JsonbField into the Field instead of copying it.
    const auto append_jsonb = [&array](const rapidjson::Value& item) -> Status {
        JsonBinaryValue jsonb_value;
        RETURN_IF_ERROR(jsonb_value.from_json_string(json_value_to_string(item)));
        JsonbField json(jsonb_value.value(), jsonb_value.size());
        array.push_back(Field::create_field<TYPE_JSONB>(std::move(json)));
        return Status::OK();
    };

    if (!col.IsArray()) {
        return append_jsonb(col);
    }
    for (const auto& sub_col : col.GetArray()) {
        RETURN_IF_ERROR(append_jsonb(sub_col));
    }
    return Status::OK();
}
572
573
// Fills `array` with the value(s) found in `col`, interpreting them as
// elements of primitive type `sub_type`.
//
// `col` may be a single JSON value or a JSON array; the per-type helpers
// handle both shapes. `time_zone` is only consulted for the date types.
//
// Returns an InternalError naming the type when `sub_type` has no handler,
// otherwise whatever the selected helper returns.
Status ScrollParser::parse_column(const rapidjson::Value& col, PrimitiveType sub_type,
                                  bool pure_doc_value, Array& array,
                                  const cctz::time_zone& time_zone) {
    switch (sub_type) {
    // All string flavours share the TYPE_STRING conversion.
    case TYPE_CHAR:
    case TYPE_VARCHAR:
    case TYPE_STRING:
        return process_column<TYPE_STRING>(col, sub_type, pure_doc_value, array);
    case TYPE_TINYINT:
        return process_column<TYPE_TINYINT>(col, sub_type, pure_doc_value, array);
    case TYPE_SMALLINT:
        return process_column<TYPE_SMALLINT>(col, sub_type, pure_doc_value, array);
    case TYPE_INT:
        return process_column<TYPE_INT>(col, sub_type, pure_doc_value, array);
    case TYPE_BIGINT:
        return process_column<TYPE_BIGINT>(col, sub_type, pure_doc_value, array);
    case TYPE_LARGEINT:
        return process_column<TYPE_LARGEINT>(col, sub_type, pure_doc_value, array);
    case TYPE_FLOAT:
        return process_column<TYPE_FLOAT>(col, sub_type, pure_doc_value, array);
    case TYPE_DOUBLE:
        return process_column<TYPE_DOUBLE>(col, sub_type, pure_doc_value, array);
    case TYPE_BOOLEAN:
        return process_column<TYPE_BOOLEAN>(col, sub_type, pure_doc_value, array);
    // date/datetime v2 is the default type for catalog table,
    // see https://github.com/apache/doris/pull/16304
    // No need to support date and datetime types.
    case TYPE_DATEV2:
        return process_date_column<TYPE_DATEV2>(col, sub_type, pure_doc_value, array, time_zone);
    case TYPE_DATETIMEV2:
        return process_date_column<TYPE_DATETIMEV2>(col, sub_type, pure_doc_value, array,
                                                    time_zone);
    case TYPE_JSONB:
        return process_jsonb_column(col, sub_type, pure_doc_value, array);
    default:
        // Name the offending type both in the log and in the returned status
        // so the failure can be diagnosed from either place.
        LOG(ERROR) << "Do not support Array type: " << type_to_string(sub_type);
        return Status::InternalError("Unsupported array element type: {}",
                                     type_to_string(sub_type));
    }
}
615
616
0
// Starts with an empty batch: both the batch size and the read cursor are zero.
// `doc_value_mode` is accepted but not used by this constructor.
ScrollParser::ScrollParser(bool doc_value_mode) : _size(0), _line_index(0) {}
617
618
0
// Compiler-generated destruction; no explicit cleanup is performed here.
ScrollParser::~ScrollParser() = default;
619
620
0
// Parses one Elasticsearch scroll/search response and caches its hit list.
//
// Parameters:
//   scroll_result - raw JSON text of the response.
//   exactly_once  - when false, the response must carry a scroll id, which is
//                   stored in `_scroll_id`; when true the scroll id is ignored.
//
// On success `_size` holds the number of hits copied into `_inner_hits_node`.
// `_size` stays 0 when the response has no inner hits member or when that
// member is not an array (treated as "no data" / end of scrolling).
//
// Returns InternalError when the text is not valid JSON, when the root or the
// outer hits member is not a JSON object, when the outer hits member is
// missing, or when a required scroll id is missing or is not a string.
Status ScrollParser::parse(const std::string& scroll_result, bool exactly_once) {
    // rely on `_size != 0` to determine whether scroll ends
    _size = 0;
    _document_node.Parse(scroll_result.c_str(), scroll_result.length());
    if (_document_node.HasParseError()) {
        return Status::InternalError("Parsing json error, json is: {}", scroll_result);
    }
    // Member lookups below are only defined on JSON objects; valid JSON such
    // as `[]` or `1` must be rejected before touching any member.
    if (!_document_node.IsObject()) {
        return Status::InternalError("Scroll response is not a json object, json is: {}",
                                     scroll_result);
    }

    if (!exactly_once) {
        const auto scroll_member = _document_node.FindMember(FIELD_SCROLL_ID);
        if (scroll_member == _document_node.MemberEnd()) {
            LOG(WARNING) << "Document has not a scroll id field scroll response:" << scroll_result;
            return Status::InternalError("Document has not a scroll id field");
        }
        if (!scroll_member->value.IsString()) {
            return Status::InternalError("Document scroll id field is not a string");
        }
        _scroll_id = scroll_member->value.GetString();
    }

    // { hits: { total : 2, "hits" : [ {}, {}, {} ]}}
    const auto outer_hits_member = _document_node.FindMember(FIELD_HITS);
    if (outer_hits_member == _document_node.MemberEnd() ||
        !outer_hits_member->value.IsObject()) {
        return Status::InternalError("Document has not a valid hits field");
    }
    const rapidjson::Value& outer_hits_node = outer_hits_member->value;

    // if has no inner hits, there has no data in this index
    const auto inner_hits_member = outer_hits_node.FindMember(FIELD_INNER_HITS);
    if (inner_hits_member == outer_hits_node.MemberEnd()) {
        return Status::OK();
    }
    const rapidjson::Value& inner_hits_node = inner_hits_member->value;
    // this happened just the end of scrolling
    if (!inner_hits_node.IsArray()) {
        return Status::OK();
    }

    _inner_hits_node.CopyFrom(inner_hits_node, _document_node.GetAllocator());
    // how many documents contains in this batch
    _size = _inner_hits_node.Size();
    return Status::OK();
}
653
654
0
int ScrollParser::get_size() const {
655
0
    return _size;
656
0
}
657
658
0
const std::string& ScrollParser::get_scroll_id() {
659
0
    return _scroll_id;
660
0
}
661
662
// Converts the next cached hit into one row and appends it to `columns`.
//
// Parameters:
//   tuple_desc       - slot descriptors; slot i is written into columns[i].
//   columns          - destination columns, one per slot.
//   line_eof         - set to true when no hit was consumed (batch exhausted
//                      or empty) and also left true on every error return;
//                      set to false only after a row was fully written.
//   docvalue_context - maps a column name to the field name to look up when
//                      the hit carries a `fields` member (doc_values mode).
//   time_zone        - forwarded to the date conversion helpers.
//
// The row data is taken from the hit's `_source` member when present,
// otherwise from its `fields` member. A column that is absent or JSON null is
// stored as NULL for nullable slots and is an error otherwise.
//
// Returns RuntimeError/InternalError for missing non-nullable values,
// a missing doc_value mapping, or any conversion failure reported by the
// per-type helpers.
Status ScrollParser::fill_columns(const TupleDescriptor* tuple_desc,
                                  std::vector<MutableColumnPtr>& columns, bool* line_eof,
                                  const std::map<std::string, std::string>& docvalue_context,
                                  const cctz::time_zone& time_zone) {
    *line_eof = true;

    if (_size <= 0 || _line_index >= _size) {
        return Status::OK();
    }

    const rapidjson::Value& obj = _inner_hits_node[_line_index++];

    // The presence of `fields` switches every conversion to doc_values mode.
    const auto fields_member = obj.FindMember("fields");
    const bool pure_doc_value = fields_member != obj.MemberEnd();

    // obj may be neither have `_source` nor `fields` field.
    // `_source` takes precedence when both are present.
    const rapidjson::Value* line = nullptr;
    const auto source_member = obj.FindMember(FIELD_SOURCE);
    if (source_member != obj.MemberEnd()) {
        line = &source_member->value;
    } else if (pure_doc_value) {
        line = &fields_member->value;
    }

    const auto& slots = tuple_desc->slots();
    for (size_t i = 0; i < slots.size(); ++i) {
        const SlotDescriptor* slot_desc = slots[i];
        auto* col_ptr = columns[i].get();

        if (slot_desc->col_name() == FIELD_ID) {
            // actually this branch will not be reached, this is guaranteed by Doris FE.
            if (pure_doc_value) {
                return Status::RuntimeError("obtain `_id` is not supported in doc_values mode");
            }
            // obj[FIELD_ID] must not be NULL
            const std::string doc_id = obj[FIELD_ID].GetString();
            col_ptr->insert_data(doc_id.data(), doc_id.length());
            continue;
        }

        // In doc_values mode the field name comes from docvalue_context.
        // Use find() so that a missing mapping becomes an error status
        // instead of a std::out_of_range exception.
        const char* col_name = slot_desc->col_name().c_str();
        if (pure_doc_value) {
            const auto mapping = docvalue_context.find(slot_desc->col_name());
            if (mapping == docvalue_context.end()) {
                return Status::RuntimeError("no doc_value field mapping for column: {}",
                                            slot_desc->col_name());
            }
            col_name = mapping->second.c_str();
        }

        // Look the column up once; nullptr means "not present in this hit".
        const rapidjson::Value* col_value = nullptr;
        if (line != nullptr) {
            const auto member = line->FindMember(col_name);
            if (member != line->MemberEnd()) {
                col_value = &member->value;
            }
        }

        if (col_value == nullptr) {
            if (slot_desc->is_nullable()) {
                // Same NULL insertion as for an explicit JSON null below.
                col_ptr->insert_data(nullptr, 0);
                continue;
            }
            std::string details = absl::Substitute(INVALID_NULL_VALUE, col_name);
            return Status::RuntimeError(details);
        }

        const rapidjson::Value& col = *col_value;

        auto type = slot_desc->type()->get_primitive_type();

        // when the column value is null, the subsequent type casting will report an error
        if (col.IsNull()) {
            if (!slot_desc->is_nullable()) {
                std::string details = absl::Substitute(INVALID_NULL_VALUE, col_name);
                return Status::RuntimeError(details);
            }
            col_ptr->insert_data(nullptr, 0);
            continue;
        }

        switch (type) {
        case TYPE_CHAR:
        case TYPE_VARCHAR:
        case TYPE_STRING: {
            // sometimes elasticsearch user post some not-string value to Elasticsearch Index.
            // because of reading value from _source, we can not process all json type and then just transfer the value to original string representation
            // this may be a tricky, but we can work around this issue
            std::string val;
            if (pure_doc_value) {
                if (col.Empty()) {
                    // NOTE(review): nothing is inserted for an empty doc_values
                    // array, so this column gets no value for this row - confirm
                    // this is intended.
                    break;
                } else if (col.Size() > 1) {
                    // doc_values with multiple elements means actual array data
                    // in ES keyword/text field, serialize as JSON array string
                    val = json_value_to_string(col);
                } else if (!col[0].IsString()) {
                    val = json_value_to_string(col[0]);
                } else {
                    val = col[0].GetString();
                }
            } else {
                // When ES mapping is keyword/text but actual data is an array,
                // serialize the array to JSON string instead of throwing an error.
                // This is valid in ES since any field can hold array values.
                if (col.IsArray() || !col.IsString()) {
                    val = json_value_to_string(col);
                } else {
                    val = col.GetString();
                }
            }
            col_ptr->insert_data(val.data(), val.length());
            break;
        }

        case TYPE_TINYINT: {
            RETURN_IF_ERROR(insert_int_value<int8_t>(col, type, col_ptr, pure_doc_value,
                                                     slot_desc->is_nullable()));
            break;
        }

        case TYPE_SMALLINT: {
            RETURN_IF_ERROR(insert_int_value<int16_t>(col, type, col_ptr, pure_doc_value,
                                                      slot_desc->is_nullable()));
            break;
        }

        case TYPE_INT: {
            RETURN_IF_ERROR(insert_int_value<int32_t>(col, type, col_ptr, pure_doc_value,
                                                      slot_desc->is_nullable()));
            break;
        }

        case TYPE_BIGINT: {
            RETURN_IF_ERROR(insert_int_value<int64_t>(col, type, col_ptr, pure_doc_value,
                                                      slot_desc->is_nullable()));
            break;
        }

        case TYPE_LARGEINT: {
            RETURN_IF_ERROR(insert_int_value<__int128>(col, type, col_ptr, pure_doc_value,
                                                       slot_desc->is_nullable()));
            break;
        }

        case TYPE_DOUBLE: {
            RETURN_IF_ERROR(insert_float_value<double>(col, type, col_ptr, pure_doc_value,
                                                       slot_desc->is_nullable()));
            break;
        }

        case TYPE_FLOAT: {
            RETURN_IF_ERROR(insert_float_value<float>(col, type, col_ptr, pure_doc_value,
                                                      slot_desc->is_nullable()));
            break;
        }

        case TYPE_BOOLEAN: {
            if (col.IsBool()) {
                int8_t val = col.GetBool();
                col_ptr->insert_data(reinterpret_cast<const char*>(&val), 0);
                break;
            }

            if (col.IsNumber()) {
                int8_t val = static_cast<int8_t>(col.GetInt());
                col_ptr->insert_data(reinterpret_cast<const char*>(&val), 0);
                break;
            }

            // doc_values wrap the value in an array: accept [bool] directly,
            // remember [string] for the string parsing below, reject the rest.
            bool is_nested_str = false;
            if (pure_doc_value && col.IsArray() && !col.Empty() && col[0].IsBool()) {
                int8_t val = col[0].GetBool();
                col_ptr->insert_data(reinterpret_cast<const char*>(&val), 0);
                break;
            } else if (pure_doc_value && col.IsArray() && !col.Empty() && col[0].IsString()) {
                is_nested_str = true;
            } else if (pure_doc_value && col.IsArray()) {
                return Status::InternalError(ERROR_INVALID_COL_DATA, "BOOLEAN");
            }

            const rapidjson::Value& str_col = is_nested_str ? col[0] : col;

            RETURN_ERROR_IF_COL_IS_ARRAY(col, type, true);

            // Parse straight from the JSON string buffer; no temporary copy needed.
            const char* bool_text = str_col.GetString();
            size_t bool_text_size = str_col.GetStringLength();
            StringParser::ParseResult result;
            bool b = StringParser::string_to_bool(bool_text, bool_text_size, &result);
            RETURN_ERROR_IF_PARSING_FAILED(result, str_col, type);
            col_ptr->insert_data(reinterpret_cast<const char*>(&b), 0);
            break;
        }
        case TYPE_DECIMALV2: {
            DecimalV2Value data;

            if (col.IsDouble()) {
                data.assign_from_double(col.GetDouble());
            } else {
                std::string val;
                if (pure_doc_value) {
                    if (col.Empty()) {
                        // NOTE(review): as in the string case, an empty
                        // doc_values array inserts nothing for this row.
                        break;
                    } else if (!col[0].IsString()) {
                        val = json_value_to_string(col[0]);
                    } else {
                        val = col[0].GetString();
                    }
                } else {
                    RETURN_ERROR_IF_COL_IS_ARRAY(col, type, true);
                    if (!col.IsString()) {
                        val = json_value_to_string(col);
                    } else {
                        val = col.GetString();
                    }
                }
                // NOTE(review): the result of parse_from_str is not checked.
                data.parse_from_str(val.data(), static_cast<int32_t>(val.length()));
            }
            col_ptr->insert_data(reinterpret_cast<const char*>(&data), 0);
            break;
        }

        case TYPE_DATE:
            RETURN_IF_ERROR(
                    fill_date_int<TYPE_DATE>(col, type, pure_doc_value, col_ptr, time_zone));
            break;
        case TYPE_DATETIME:
            RETURN_IF_ERROR(
                    fill_date_int<TYPE_DATETIME>(col, type, pure_doc_value, col_ptr, time_zone));
            break;
        case TYPE_DATEV2:
            RETURN_IF_ERROR(
                    fill_date_int<TYPE_DATEV2>(col, type, pure_doc_value, col_ptr, time_zone));
            break;
        case TYPE_DATETIMEV2: {
            RETURN_IF_ERROR(
                    fill_date_int<TYPE_DATETIMEV2>(col, type, pure_doc_value, col_ptr, time_zone));
            break;
        }
        case TYPE_ARRAY: {
            Array array;
            // Element type of the (possibly nullable) array slot.
            const auto sub_type =
                    assert_cast<const DataTypeArray*>(remove_nullable(slot_desc->type()).get())
                            ->get_nested_type()
                            ->get_primitive_type();
            RETURN_IF_ERROR(parse_column(col, sub_type, pure_doc_value, array, time_zone));
            col_ptr->insert(Field::create_field<TYPE_ARRAY>(array));
            break;
        }
        case TYPE_JSONB: {
            JsonBinaryValue jsonb_value;
            RETURN_IF_ERROR(jsonb_value.from_json_string(json_value_to_string(col)));
            JsonbField json(jsonb_value.value(), jsonb_value.size());
            col_ptr->insert(Field::create_field<TYPE_JSONB>(json));
            break;
        }
        default: {
            // NOTE(review): when DCHECK is compiled out, an unsupported type
            // is only logged and no value is inserted for this column.
            LOG(ERROR) << "Unsupported data type: " << type_to_string(type);
            DCHECK(false);
            break;
        }
        }
    }

    *line_eof = false;
    return Status::OK();
}
917
#include "common/compile_check_end.h"
918
} // namespace doris