Coverage Report

Created: 2026-04-10 04:10

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exec/es/es_scroll_parser.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "exec/es/es_scroll_parser.h"
19
20
#include <absl/strings/substitute.h>
21
#include <cctz/time_zone.h>
22
#include <glog/logging.h>
23
#include <rapidjson/allocators.h>
24
#include <rapidjson/encodings.h>
25
#include <stdint.h>
26
#include <string.h>
27
28
// IWYU pragma: no_include <bits/chrono.h>
29
#include <chrono> // IWYU pragma: keep
30
#include <cstdlib>
31
#include <ostream>
32
#include <string>
33
34
#include "common/status.h"
35
#include "core/binary_cast.hpp"
36
#include "core/column/column.h"
37
#include "core/column/column_nullable.h"
38
#include "core/data_type/data_type_array.h"
39
#include "core/data_type/data_type_nullable.h"
40
#include "core/data_type/define_primitive_type.h"
41
#include "core/data_type/primitive_type.h"
42
#include "core/field.h"
43
#include "core/value/decimalv2_value.h"
44
#include "core/value/jsonb_value.h"
45
#include "core/value/vdatetime_value.h"
46
#include "exprs/function/cast/cast_to_date_or_datetime_impl.hpp"
47
#include "exprs/function/cast/cast_to_datetimev2_impl.hpp"
48
#include "exprs/function/cast/cast_to_datev2_impl.hpp"
49
#include "rapidjson/document.h"
50
#include "rapidjson/rapidjson.h"
51
#include "rapidjson/stringbuffer.h"
52
#include "rapidjson/writer.h"
53
#include "runtime/descriptors.h"
54
#include "util/string_parser.hpp"
55
56
namespace doris {
57
58
// JSON member names used when reading an Elasticsearch scroll response.
// The pointers are constexpr so they cannot be re-pointed at run time.
// NOTE(review): FIELD_HITS and FIELD_INNER_HITS both hold "hits"; presumably the
// response nests a "hits" member inside "hits" -- confirm against the parser code.
static constexpr const char* FIELD_SCROLL_ID = "_scroll_id";
static constexpr const char* FIELD_HITS = "hits";
static constexpr const char* FIELD_INNER_HITS = "hits";
static constexpr const char* FIELD_SOURCE = "_source";
static constexpr const char* FIELD_ID = "_id";
63
64
// get the original json data type
65
0
std::string json_type_to_string(rapidjson::Type type) {
66
0
    switch (type) {
67
0
    case rapidjson::kNumberType:
68
0
        return "Number";
69
0
    case rapidjson::kStringType:
70
0
        return "Varchar/Char";
71
0
    case rapidjson::kArrayType:
72
0
        return "Array";
73
0
    case rapidjson::kObjectType:
74
0
        return "Object";
75
0
    case rapidjson::kNullType:
76
0
        return "Null Type";
77
0
    case rapidjson::kFalseType:
78
0
    case rapidjson::kTrueType:
79
0
        return "True/False";
80
0
    default:
81
0
        return "Unknown Type";
82
0
    }
83
0
}
84
85
// transfer rapidjson::Value to string representation
86
0
std::string json_value_to_string(const rapidjson::Value& value) {
87
0
    rapidjson::StringBuffer scratch_buffer;
88
0
    rapidjson::Writer<rapidjson::StringBuffer> temp_writer(scratch_buffer);
89
0
    value.Accept(temp_writer);
90
0
    return scratch_buffer.GetString();
91
0
}
92
93
// Message templates for errors raised while converting Elasticsearch data.
// ERROR_INVALID_COL_DATA carries a "{}" placeholder; the other templates carry
// positional "$0", "$1", ... placeholders.
static const std::string ERROR_INVALID_COL_DATA =
        "Data source returned inconsistent column data. "
        "Expected value of type {} based on column metadata. This likely indicates a "
        "problem with the data source library.";
static const std::string ERROR_MEM_LIMIT_EXCEEDED =
        "DataSourceScanNode::$0() failed to allocate "
        "$1 bytes for $2.";
// The first fragment ends with a space so that the type name substituted for $0 is
// not glued to the following word.
static const std::string ERROR_COL_DATA_IS_ARRAY =
        "Data source returned an array for the type $0 "
        "based on column metadata.";
static const std::string INVALID_NULL_VALUE =
        "Invalid null value occurs: Non-null column `$0` contains NULL";
105
106
// Returns Status::RuntimeError from the enclosing function when `col.IsArray()`
// equals `is_array`. The message contains the expected type (via type_to_string),
// the JSON type actually found and the serialized JSON value.
// Macro parameters are parenthesized so that any expression can be passed safely.
#define RETURN_ERROR_IF_COL_IS_ARRAY(col, type, is_array)                      \
    do {                                                                       \
        if ((col).IsArray() == (is_array)) {                                   \
            std::stringstream ss;                                              \
            ss << "Expected value of type: " << type_to_string(type)           \
               << "; but found type: " << json_type_to_string((col).GetType()) \
               << "; Document slice is : " << json_value_to_string(col);       \
            return Status::RuntimeError(ss.str());                             \
        }                                                                      \
    } while (false)
116
117
// Returns Status::RuntimeError from the enclosing function when `col` is not a JSON
// string. The message contains the expected type (via type_to_string), the JSON type
// actually found and the serialized JSON value.
// Macro parameters are parenthesized so that any expression can be passed safely.
#define RETURN_ERROR_IF_COL_IS_NOT_STRING(col, type)                            \
    do {                                                                        \
        if (!(col).IsString()) {                                                \
            std::stringstream ss;                                               \
            ss << "Expected value of type: " << type_to_string(type)            \
               << "; but found type: " << json_type_to_string((col).GetType())  \
               << "; Document source slice is : " << json_value_to_string(col); \
            return Status::RuntimeError(ss.str());                              \
        }                                                                       \
    } while (false)
127
128
// Returns Status::RuntimeError from the enclosing function when `col` is not a JSON
// number. The message contains the expected type (via type_to_string), the JSON type
// actually found and the serialized JSON value.
// Macro parameters are parenthesized so that any expression can be passed safely.
#define RETURN_ERROR_IF_COL_IS_NOT_NUMBER(col, type)                           \
    do {                                                                       \
        if (!(col).IsNumber()) {                                               \
            std::stringstream ss;                                              \
            ss << "Expected value of type: " << type_to_string(type)           \
               << "; but found type: " << json_type_to_string((col).GetType()) \
               << "; Document value is: " << json_value_to_string(col);        \
            return Status::RuntimeError(ss.str());                             \
        }                                                                      \
    } while (false)
138
139
// Returns Status::RuntimeError from the enclosing function when `result` differs from
// StringParser::PARSE_SUCCESS. The message contains the expected type (via
// type_to_string), the JSON type of `col` and the serialized JSON value.
// Macro parameters are parenthesized so that any expression can be passed safely.
#define RETURN_ERROR_IF_PARSING_FAILED(result, col, type)                       \
    do {                                                                        \
        if ((result) != StringParser::PARSE_SUCCESS) {                          \
            std::stringstream ss;                                               \
            ss << "Expected value of type: " << type_to_string(type)            \
               << "; but found type: " << json_type_to_string((col).GetType())  \
               << "; Document source slice is : " << json_value_to_string(col); \
            return Status::RuntimeError(ss.str());                              \
        }                                                                       \
    } while (false)
149
150
// Unconditionally returns Status::RuntimeError from the enclosing function. Despite
// the "IF" in its name the macro tests nothing itself; the caller decides when to
// invoke it. The message contains the expected type (via type_to_string), the JSON
// type of `col` and the serialized JSON value.
// Macro parameters are parenthesized so that any expression can be passed safely.
#define RETURN_ERROR_IF_CAST_FORMAT_ERROR(col, type)                       \
    do {                                                                   \
        std::stringstream ss;                                              \
        ss << "Expected value of type: " << type_to_string(type)           \
           << "; but found type: " << json_type_to_string((col).GetType()) \
           << "; Document slice is : " << json_value_to_string(col);       \
        return Status::RuntimeError(ss.str());                             \
    } while (false)
158
159
template <typename T>
160
Status get_int_value(const rapidjson::Value& col, PrimitiveType type, void* slot,
161
0
                     bool pure_doc_value) {
162
0
    if (col.IsNumber()) {
163
0
        *reinterpret_cast<T*>(slot) = (T)(sizeof(T) < 8 ? col.GetInt() : col.GetInt64());
164
0
        return Status::OK();
165
0
    }
166
167
0
    if (pure_doc_value && col.IsArray() && !col.Empty()) {
168
0
        RETURN_ERROR_IF_COL_IS_NOT_NUMBER(col[0], type);
169
0
        *reinterpret_cast<T*>(slot) = (T)(sizeof(T) < 8 ? col[0].GetInt() : col[0].GetInt64());
170
0
        return Status::OK();
171
0
    }
172
173
0
    RETURN_ERROR_IF_COL_IS_ARRAY(col, type, true);
174
0
    RETURN_ERROR_IF_COL_IS_NOT_STRING(col, type);
175
176
0
    StringParser::ParseResult result;
177
0
    const std::string& val = col.GetString();
178
0
    size_t len = col.GetStringLength();
179
0
    T v = StringParser::string_to_int<T>(val.c_str(), len, &result);
180
0
    RETURN_ERROR_IF_PARSING_FAILED(result, col, type);
181
182
0
    if (sizeof(T) < 16) {
183
0
        *reinterpret_cast<T*>(slot) = v;
184
0
    } else {
185
0
        DCHECK(sizeof(T) == 16);
186
0
        memcpy(slot, &v, sizeof(v));
187
0
    }
188
189
0
    return Status::OK();
190
0
}
Unexecuted instantiation: _ZN5doris13get_int_valueIaEENS_6StatusERKN9rapidjson12GenericValueINS2_4UTF8IcEENS2_19MemoryPoolAllocatorINS2_12CrtAllocatorEEEEENS_13PrimitiveTypeEPvb
Unexecuted instantiation: _ZN5doris13get_int_valueIsEENS_6StatusERKN9rapidjson12GenericValueINS2_4UTF8IcEENS2_19MemoryPoolAllocatorINS2_12CrtAllocatorEEEEENS_13PrimitiveTypeEPvb
Unexecuted instantiation: _ZN5doris13get_int_valueIiEENS_6StatusERKN9rapidjson12GenericValueINS2_4UTF8IcEENS2_19MemoryPoolAllocatorINS2_12CrtAllocatorEEEEENS_13PrimitiveTypeEPvb
Unexecuted instantiation: _ZN5doris13get_int_valueIlEENS_6StatusERKN9rapidjson12GenericValueINS2_4UTF8IcEENS2_19MemoryPoolAllocatorINS2_12CrtAllocatorEEEEENS_13PrimitiveTypeEPvb
Unexecuted instantiation: _ZN5doris13get_int_valueInEENS_6StatusERKN9rapidjson12GenericValueINS2_4UTF8IcEENS2_19MemoryPoolAllocatorINS2_12CrtAllocatorEEEEENS_13PrimitiveTypeEPvb
191
192
template <PrimitiveType T>
193
Status get_date_value_int(const rapidjson::Value& col, PrimitiveType type, bool is_date_str,
194
                          typename PrimitiveTypeTraits<T>::CppType* slot,
195
0
                          const cctz::time_zone& time_zone) {
196
0
    constexpr bool is_datetime_v1 = T == TYPE_DATE || T == TYPE_DATETIME;
197
0
    typename PrimitiveTypeTraits<T>::CppType dt_val;
198
0
    if (is_date_str) {
199
0
        const std::string str_date = col.GetString();
200
0
        int str_length = col.GetStringLength();
201
0
        bool success = false;
202
0
        if (str_length > 19) {
203
0
            std::chrono::system_clock::time_point tp;
204
            // time_zone suffix pattern
205
            // Z/+08:00/-04:30
206
0
            RE2 time_zone_pattern(R"([+-]\d{2}:?\d{2}|Z)");
207
0
            bool ok = false;
208
0
            std::string fmt;
209
0
            re2::StringPiece value;
210
0
            if (time_zone_pattern.Match(str_date, 0, str_date.size(), RE2::UNANCHORED, &value, 1)) {
211
                // with time_zone info
212
                // YYYY-MM-DDTHH:MM:SSZ or YYYY-MM-DDTHH:MM:SS+08:00
213
                // or 2022-08-08T12:10:10.000Z or YYYY-MM-DDTHH:MM:SS-08:00
214
0
                fmt = "%Y-%m-%dT%H:%M:%E*S%Ez";
215
0
                cctz::time_zone ctz;
216
                // find time_zone by time_zone suffix string
217
0
                TimezoneUtils::find_cctz_time_zone(value.as_string(), ctz);
218
0
                ok = cctz::parse(fmt, str_date, ctz, &tp);
219
0
            } else {
220
                // without time_zone info
221
                // 2022-08-08T12:10:10.000
222
0
                fmt = "%Y-%m-%dT%H:%M:%E*S";
223
                // If the time without time_zone info, ES will assume it is UTC time.
224
                // So we parse it in Doris with UTC time zone.
225
0
                ok = cctz::parse(fmt, str_date, cctz::utc_time_zone(), &tp);
226
0
            }
227
0
            if (ok) {
228
                // The local time zone can change by session variable `time_zone`
229
                // We should use the user specified time zone, not the actual system local time zone.
230
0
                success = true;
231
0
                dt_val.from_unixtime(std::chrono::system_clock::to_time_t(tp), time_zone);
232
0
            }
233
0
        } else if (str_length == 19) {
234
            // YYYY-MM-DDTHH:MM:SS
235
0
            if (*(str_date.c_str() + 10) == 'T') {
236
0
                std::chrono::system_clock::time_point tp;
237
0
                const bool ok =
238
0
                        cctz::parse("%Y-%m-%dT%H:%M:%S", str_date, cctz::utc_time_zone(), &tp);
239
0
                if (ok) {
240
0
                    success = true;
241
0
                    dt_val.from_unixtime(std::chrono::system_clock::to_time_t(tp), time_zone);
242
0
                }
243
0
            } else {
244
                // YYYY-MM-DD HH:MM:SS
245
0
                CastParameters params;
246
0
                if constexpr (is_datetime_v1) {
247
0
                    success = CastToDateOrDatetime::from_string_non_strict_mode<
248
0
                            DatelikeTargetType::DATE_TIME>({str_date.c_str(), (size_t)str_length},
249
0
                                                           dt_val, nullptr, params);
250
0
                } else if constexpr (T == TYPE_DATEV2) {
251
0
                    success = CastToDateV2::from_string_non_strict_mode(
252
0
                            {str_date.c_str(), (size_t)str_length}, dt_val, nullptr, params);
253
0
                } else {
254
0
                    success = CastToDatetimeV2::from_string_non_strict_mode(
255
0
                            {str_date.c_str(), (size_t)str_length}, dt_val, nullptr, -1, params);
256
0
                }
257
0
            }
258
259
0
        } else if (str_length == 13) {
260
            // string long like "1677895728000"
261
0
            int64_t time_long = std::atol(str_date.c_str());
262
0
            if (time_long > 0) {
263
0
                success = true;
264
0
                dt_val.from_unixtime(time_long / 1000, time_zone);
265
0
            }
266
0
        } else {
267
            // YYYY-MM-DD or others
268
0
            CastParameters params;
269
0
            if constexpr (is_datetime_v1) {
270
0
                success = CastToDateOrDatetime::from_string_non_strict_mode<
271
0
                        DatelikeTargetType::DATE_TIME>({str_date.c_str(), (size_t)str_length},
272
0
                                                       dt_val, nullptr, params);
273
0
            } else if constexpr (T == TYPE_DATEV2) {
274
0
                success = CastToDateV2::from_string_non_strict_mode(
275
0
                        {str_date.c_str(), (size_t)str_length}, dt_val, nullptr, params);
276
0
            } else {
277
0
                success = CastToDatetimeV2::from_string_non_strict_mode(
278
0
                        {str_date.c_str(), (size_t)str_length}, dt_val, nullptr, -1, params);
279
0
            }
280
0
        }
281
282
0
        if (!success) {
283
0
            RETURN_ERROR_IF_CAST_FORMAT_ERROR(col, type);
284
0
        }
285
286
0
    } else {
287
0
        dt_val.from_unixtime(col.GetInt64() / 1000, time_zone);
288
0
    }
289
0
    if constexpr (is_datetime_v1) {
290
0
        if (type == TYPE_DATE) {
291
0
            dt_val.cast_to_date();
292
0
        } else {
293
0
            dt_val.to_datetime();
294
0
        }
295
0
    }
296
297
0
    *slot = *reinterpret_cast<typename PrimitiveTypeTraits<T>::CppType*>(&dt_val);
298
0
    return Status::OK();
299
0
}
Unexecuted instantiation: _ZN5doris18get_date_value_intILNS_13PrimitiveTypeE25EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bPNS_19PrimitiveTypeTraitsIXT_EE7CppTypeERKN4cctz9time_zoneE
Unexecuted instantiation: _ZN5doris18get_date_value_intILNS_13PrimitiveTypeE26EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bPNS_19PrimitiveTypeTraitsIXT_EE7CppTypeERKN4cctz9time_zoneE
Unexecuted instantiation: _ZN5doris18get_date_value_intILNS_13PrimitiveTypeE11EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bPNS_19PrimitiveTypeTraitsIXT_EE7CppTypeERKN4cctz9time_zoneE
Unexecuted instantiation: _ZN5doris18get_date_value_intILNS_13PrimitiveTypeE12EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bPNS_19PrimitiveTypeTraitsIXT_EE7CppTypeERKN4cctz9time_zoneE
300
301
template <PrimitiveType T>
302
Status get_date_int(const rapidjson::Value& col, PrimitiveType type, bool pure_doc_value,
303
                    typename PrimitiveTypeTraits<T>::CppType* slot,
304
0
                    const cctz::time_zone& time_zone) {
305
    // this would happend just only when `enable_docvalue_scan = false`, and field has timestamp format date from _source
306
0
    if (col.IsNumber()) {
307
        // ES process date/datetime field would use millisecond timestamp for index or docvalue
308
        // processing date type field, if a number is encountered, Doris On ES will force it to be processed according to ms
309
        // Doris On ES needs to be consistent with ES, so just divided by 1000 because the unit for from_unixtime is seconds
310
0
        return get_date_value_int<T>(col, type, false, slot, time_zone);
311
0
    } else if (col.IsArray() && pure_doc_value && !col.Empty()) {
312
        // this would happened just only when `enable_docvalue_scan = true`
313
        // ES add default format for all field after ES 6.4, if we not provided format for `date` field ES would impose
314
        // a standard date-format for date field as `2020-06-16T00:00:00.000Z`
315
        // At present, we just process this string format date. After some PR were merged into Doris, we would impose `epoch_mills` for
316
        // date field's docvalue
317
0
        if (col[0].IsString()) {
318
0
            return get_date_value_int<T>(col[0], type, true, slot, time_zone);
319
0
        }
320
        // ES would return millisecond timestamp for date field, divided by 1000 because the unit for from_unixtime is seconds
321
0
        return get_date_value_int<T>(col[0], type, false, slot, time_zone);
322
0
    } else {
323
        // this would happened just only when `enable_docvalue_scan = false`, and field has string format date from _source
324
0
        RETURN_ERROR_IF_COL_IS_ARRAY(col, type, true);
325
0
        RETURN_ERROR_IF_COL_IS_NOT_STRING(col, type);
326
0
        return get_date_value_int<T>(col, type, true, slot, time_zone);
327
0
    }
328
0
}
Unexecuted instantiation: _ZN5doris12get_date_intILNS_13PrimitiveTypeE25EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bPNS_19PrimitiveTypeTraitsIXT_EE7CppTypeERKN4cctz9time_zoneE
Unexecuted instantiation: _ZN5doris12get_date_intILNS_13PrimitiveTypeE26EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bPNS_19PrimitiveTypeTraitsIXT_EE7CppTypeERKN4cctz9time_zoneE
Unexecuted instantiation: _ZN5doris12get_date_intILNS_13PrimitiveTypeE11EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bPNS_19PrimitiveTypeTraitsIXT_EE7CppTypeERKN4cctz9time_zoneE
Unexecuted instantiation: _ZN5doris12get_date_intILNS_13PrimitiveTypeE12EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bPNS_19PrimitiveTypeTraitsIXT_EE7CppTypeERKN4cctz9time_zoneE
329
template <PrimitiveType T>
330
Status fill_date_int(const rapidjson::Value& col, PrimitiveType type, bool pure_doc_value,
331
0
                     IColumn* col_ptr, const cctz::time_zone& time_zone) {
332
0
    typename PrimitiveTypeTraits<T>::CppType data;
333
0
    RETURN_IF_ERROR((get_date_int<T>(col, type, pure_doc_value, &data, time_zone)));
334
0
    col_ptr->insert_data(const_cast<const char*>(reinterpret_cast<char*>(&data)), 0);
335
0
    return Status::OK();
336
0
}
Unexecuted instantiation: _ZN5doris13fill_date_intILNS_13PrimitiveTypeE11EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bPNS_7IColumnERKN4cctz9time_zoneE
Unexecuted instantiation: _ZN5doris13fill_date_intILNS_13PrimitiveTypeE12EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bPNS_7IColumnERKN4cctz9time_zoneE
Unexecuted instantiation: _ZN5doris13fill_date_intILNS_13PrimitiveTypeE25EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bPNS_7IColumnERKN4cctz9time_zoneE
Unexecuted instantiation: _ZN5doris13fill_date_intILNS_13PrimitiveTypeE26EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bPNS_7IColumnERKN4cctz9time_zoneE
337
338
template <typename T>
339
Status get_float_value(const rapidjson::Value& col, PrimitiveType type, void* slot,
340
0
                       bool pure_doc_value) {
341
0
    static_assert(sizeof(T) == 4 || sizeof(T) == 8);
342
0
    if (col.IsNumber()) {
343
0
        *reinterpret_cast<T*>(slot) = (T)(sizeof(T) == 4 ? col.GetFloat() : col.GetDouble());
344
0
        return Status::OK();
345
0
    }
346
347
0
    if (pure_doc_value && col.IsArray() && !col.Empty()) {
348
0
        *reinterpret_cast<T*>(slot) = (T)(sizeof(T) == 4 ? col[0].GetFloat() : col[0].GetDouble());
349
0
        return Status::OK();
350
0
    }
351
352
0
    RETURN_ERROR_IF_COL_IS_ARRAY(col, type, true);
353
0
    RETURN_ERROR_IF_COL_IS_NOT_STRING(col, type);
354
355
0
    StringParser::ParseResult result;
356
0
    const std::string& val = col.GetString();
357
0
    size_t len = col.GetStringLength();
358
0
    T v = StringParser::string_to_float<T>(val.c_str(), len, &result);
359
0
    RETURN_ERROR_IF_PARSING_FAILED(result, col, type);
360
0
    *reinterpret_cast<T*>(slot) = v;
361
362
0
    return Status::OK();
363
0
}
Unexecuted instantiation: _ZN5doris15get_float_valueIfEENS_6StatusERKN9rapidjson12GenericValueINS2_4UTF8IcEENS2_19MemoryPoolAllocatorINS2_12CrtAllocatorEEEEENS_13PrimitiveTypeEPvb
Unexecuted instantiation: _ZN5doris15get_float_valueIdEENS_6StatusERKN9rapidjson12GenericValueINS2_4UTF8IcEENS2_19MemoryPoolAllocatorINS2_12CrtAllocatorEEEEENS_13PrimitiveTypeEPvb
364
365
template <typename T>
366
Status insert_float_value(const rapidjson::Value& col, PrimitiveType type, IColumn* col_ptr,
367
0
                          bool pure_doc_value, bool nullable) {
368
0
    static_assert(sizeof(T) == 4 || sizeof(T) == 8);
369
0
    if (col.IsNumber() && nullable) {
370
0
        T value = (T)(sizeof(T) == 4 ? col.GetFloat() : col.GetDouble());
371
0
        col_ptr->insert_data(const_cast<const char*>(reinterpret_cast<char*>(&value)), 0);
372
0
        return Status::OK();
373
0
    }
374
375
0
    if (pure_doc_value && col.IsArray() && !col.Empty() && nullable) {
376
0
        T value = (T)(sizeof(T) == 4 ? col[0].GetFloat() : col[0].GetDouble());
377
0
        col_ptr->insert_data(const_cast<const char*>(reinterpret_cast<char*>(&value)), 0);
378
0
        return Status::OK();
379
0
    }
380
381
0
    RETURN_ERROR_IF_COL_IS_ARRAY(col, type, true);
382
0
    RETURN_ERROR_IF_COL_IS_NOT_STRING(col, type);
383
384
0
    StringParser::ParseResult result;
385
0
    const std::string& val = col.GetString();
386
0
    size_t len = col.GetStringLength();
387
0
    T v = StringParser::string_to_float<T>(val.c_str(), len, &result);
388
0
    RETURN_ERROR_IF_PARSING_FAILED(result, col, type);
389
390
0
    col_ptr->insert_data(const_cast<const char*>(reinterpret_cast<char*>(&v)), 0);
391
392
0
    return Status::OK();
393
0
}
Unexecuted instantiation: _ZN5doris18insert_float_valueIdEENS_6StatusERKN9rapidjson12GenericValueINS2_4UTF8IcEENS2_19MemoryPoolAllocatorINS2_12CrtAllocatorEEEEENS_13PrimitiveTypeEPNS_7IColumnEbb
Unexecuted instantiation: _ZN5doris18insert_float_valueIfEENS_6StatusERKN9rapidjson12GenericValueINS2_4UTF8IcEENS2_19MemoryPoolAllocatorINS2_12CrtAllocatorEEEEENS_13PrimitiveTypeEPNS_7IColumnEbb
394
395
template <typename T>
396
Status insert_int_value(const rapidjson::Value& col, PrimitiveType type, IColumn* col_ptr,
397
0
                        bool pure_doc_value, bool nullable) {
398
0
    if (col.IsNumber()) {
399
0
        T value;
400
        // ES allows inserting float and double in int/long types.
401
        // To parse these numbers in Doris, we direct cast them to int types.
402
0
        if (col.IsDouble()) {
403
0
            value = static_cast<T>(col.GetDouble());
404
0
        } else if (col.IsFloat()) {
405
0
            value = static_cast<T>(col.GetFloat());
406
0
        } else {
407
0
            value = (T)(sizeof(T) < 8 ? col.GetInt() : col.GetInt64());
408
0
        }
409
0
        col_ptr->insert_data(const_cast<const char*>(reinterpret_cast<char*>(&value)), 0);
410
0
        return Status::OK();
411
0
    }
412
413
0
    auto parse_and_insert_data = [&](const rapidjson::Value& col_value) -> Status {
414
0
        StringParser::ParseResult result;
415
0
        std::string val = col_value.GetString();
416
        // ES allows inserting numbers and characters containing decimals in numeric types.
417
        // To parse these numbers in Doris, we remove the decimals here.
418
0
        size_t pos = val.find('.');
419
0
        if (pos != std::string::npos) {
420
0
            val = val.substr(0, pos);
421
0
        }
422
0
        size_t len = val.length();
423
0
        T v = StringParser::string_to_int<T>(val.c_str(), len, &result);
424
0
        RETURN_ERROR_IF_PARSING_FAILED(result, col_value, type);
425
426
0
        col_ptr->insert_data(const_cast<const char*>(reinterpret_cast<char*>(&v)), 0);
427
0
        return Status::OK();
428
0
    };
Unexecuted instantiation: _ZZN5doris16insert_int_valueIaEENS_6StatusERKN9rapidjson12GenericValueINS2_4UTF8IcEENS2_19MemoryPoolAllocatorINS2_12CrtAllocatorEEEEENS_13PrimitiveTypeEPNS_7IColumnEbbENKUlSB_E_clESB_
Unexecuted instantiation: _ZZN5doris16insert_int_valueIsEENS_6StatusERKN9rapidjson12GenericValueINS2_4UTF8IcEENS2_19MemoryPoolAllocatorINS2_12CrtAllocatorEEEEENS_13PrimitiveTypeEPNS_7IColumnEbbENKUlSB_E_clESB_
Unexecuted instantiation: _ZZN5doris16insert_int_valueIiEENS_6StatusERKN9rapidjson12GenericValueINS2_4UTF8IcEENS2_19MemoryPoolAllocatorINS2_12CrtAllocatorEEEEENS_13PrimitiveTypeEPNS_7IColumnEbbENKUlSB_E_clESB_
Unexecuted instantiation: _ZZN5doris16insert_int_valueIlEENS_6StatusERKN9rapidjson12GenericValueINS2_4UTF8IcEENS2_19MemoryPoolAllocatorINS2_12CrtAllocatorEEEEENS_13PrimitiveTypeEPNS_7IColumnEbbENKUlSB_E_clESB_
Unexecuted instantiation: _ZZN5doris16insert_int_valueInEENS_6StatusERKN9rapidjson12GenericValueINS2_4UTF8IcEENS2_19MemoryPoolAllocatorINS2_12CrtAllocatorEEEEENS_13PrimitiveTypeEPNS_7IColumnEbbENKUlSB_E_clESB_
429
430
0
    if (pure_doc_value && col.IsArray() && !col.Empty()) {
431
0
        if (col[0].IsNumber()) {
432
0
            T value = (T)(sizeof(T) < 8 ? col[0].GetInt() : col[0].GetInt64());
433
0
            col_ptr->insert_data(const_cast<const char*>(reinterpret_cast<char*>(&value)), 0);
434
0
            return Status::OK();
435
0
        } else {
436
0
            RETURN_ERROR_IF_COL_IS_ARRAY(col[0], type, true);
437
0
            RETURN_ERROR_IF_COL_IS_NOT_STRING(col[0], type);
438
0
            return parse_and_insert_data(col[0]);
439
0
        }
440
0
    }
441
442
0
    RETURN_ERROR_IF_COL_IS_ARRAY(col, type, true);
443
0
    RETURN_ERROR_IF_COL_IS_NOT_STRING(col, type);
444
0
    return parse_and_insert_data(col);
445
0
}
Unexecuted instantiation: _ZN5doris16insert_int_valueIaEENS_6StatusERKN9rapidjson12GenericValueINS2_4UTF8IcEENS2_19MemoryPoolAllocatorINS2_12CrtAllocatorEEEEENS_13PrimitiveTypeEPNS_7IColumnEbb
Unexecuted instantiation: _ZN5doris16insert_int_valueIsEENS_6StatusERKN9rapidjson12GenericValueINS2_4UTF8IcEENS2_19MemoryPoolAllocatorINS2_12CrtAllocatorEEEEENS_13PrimitiveTypeEPNS_7IColumnEbb
Unexecuted instantiation: _ZN5doris16insert_int_valueIiEENS_6StatusERKN9rapidjson12GenericValueINS2_4UTF8IcEENS2_19MemoryPoolAllocatorINS2_12CrtAllocatorEEEEENS_13PrimitiveTypeEPNS_7IColumnEbb
Unexecuted instantiation: _ZN5doris16insert_int_valueIlEENS_6StatusERKN9rapidjson12GenericValueINS2_4UTF8IcEENS2_19MemoryPoolAllocatorINS2_12CrtAllocatorEEEEENS_13PrimitiveTypeEPNS_7IColumnEbb
Unexecuted instantiation: _ZN5doris16insert_int_valueInEENS_6StatusERKN9rapidjson12GenericValueINS2_4UTF8IcEENS2_19MemoryPoolAllocatorINS2_12CrtAllocatorEEEEENS_13PrimitiveTypeEPNS_7IColumnEbb
446
447
template <PrimitiveType T>
448
Status handle_value(const rapidjson::Value& col, PrimitiveType sub_type, bool pure_doc_value,
449
0
                    typename PrimitiveTypeTraits<T>::CppType& val) {
450
    if constexpr (T == TYPE_TINYINT || T == TYPE_SMALLINT || T == TYPE_INT || T == TYPE_BIGINT ||
451
0
                  T == TYPE_LARGEINT) {
452
0
        RETURN_IF_ERROR(get_int_value<typename PrimitiveTypeTraits<T>::CppType>(col, sub_type, &val,
453
0
                                                                                pure_doc_value));
454
0
        return Status::OK();
455
0
    }
456
0
    if constexpr (T == TYPE_FLOAT) {
457
0
        RETURN_IF_ERROR(get_float_value<float>(col, sub_type, &val, pure_doc_value));
458
0
        return Status::OK();
459
0
    }
460
0
    if constexpr (T == TYPE_DOUBLE) {
461
0
        RETURN_IF_ERROR(get_float_value<double>(col, sub_type, &val, pure_doc_value));
462
0
        return Status::OK();
463
0
    }
464
0
    if constexpr (T == TYPE_STRING || T == TYPE_CHAR || T == TYPE_VARCHAR) {
465
        // When ES mapping is keyword/text but actual data is an array,
466
        // serialize the array to JSON string instead of throwing an error.
467
        // This is valid in ES since any field can hold array values.
468
0
        if (col.IsArray()) {
469
0
            val = json_value_to_string(col);
470
0
        } else if (!col.IsString()) {
471
0
            val = json_value_to_string(col);
472
0
        } else {
473
0
            val = col.GetString();
474
0
        }
475
0
        return Status::OK();
476
0
    }
477
0
    if constexpr (T == TYPE_BOOLEAN) {
478
0
        if (col.IsBool()) {
479
0
            val = col.GetBool();
480
0
            return Status::OK();
481
0
        }
482
483
0
        if (col.IsNumber()) {
484
0
            val = static_cast<typename PrimitiveTypeTraits<T>::CppType>(col.GetInt());
485
0
            return Status::OK();
486
0
        }
487
488
0
        bool is_nested_str = false;
489
0
        if (pure_doc_value && col.IsArray() && !col.Empty() && col[0].IsBool()) {
490
0
            val = col[0].GetBool();
491
0
            return Status::OK();
492
0
        } else if (pure_doc_value && col.IsArray() && !col.Empty() && col[0].IsString()) {
493
0
            is_nested_str = true;
494
0
        } else if (pure_doc_value && col.IsArray()) {
495
0
            return Status::InternalError(ERROR_INVALID_COL_DATA, "BOOLEAN");
496
0
        }
497
498
0
        const rapidjson::Value& str_col = is_nested_str ? col[0] : col;
499
0
        const std::string& str_val = str_col.GetString();
500
0
        size_t val_size = str_col.GetStringLength();
501
0
        StringParser::ParseResult result;
502
0
        val = StringParser::string_to_bool(str_val.c_str(), val_size, &result);
503
0
        RETURN_ERROR_IF_PARSING_FAILED(result, str_col, sub_type);
504
0
        return Status::OK();
505
0
    }
506
0
    throw Exception(ErrorCode::INTERNAL_ERROR, "Un-supported type: {}", type_to_string(T));
507
0
}
Unexecuted instantiation: _ZN5doris12handle_valueILNS_13PrimitiveTypeE23EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12handle_valueILNS_13PrimitiveTypeE3EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12handle_valueILNS_13PrimitiveTypeE4EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12handle_valueILNS_13PrimitiveTypeE5EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12handle_valueILNS_13PrimitiveTypeE6EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12handle_valueILNS_13PrimitiveTypeE7EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12handle_valueILNS_13PrimitiveTypeE8EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12handle_valueILNS_13PrimitiveTypeE9EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12handle_valueILNS_13PrimitiveTypeE2EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
508
509
template <PrimitiveType T>
510
Status process_single_column(const rapidjson::Value& col, PrimitiveType sub_type,
511
0
                             bool pure_doc_value, Array& array) {
512
0
    typename PrimitiveTypeTraits<T>::CppType val;
513
0
    RETURN_IF_ERROR(handle_value<T>(col, sub_type, pure_doc_value, val));
514
0
    array.push_back(Field::create_field<T>(val));
515
0
    return Status::OK();
516
0
}
Unexecuted instantiation: _ZN5doris21process_single_columnILNS_13PrimitiveTypeE23EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris21process_single_columnILNS_13PrimitiveTypeE3EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris21process_single_columnILNS_13PrimitiveTypeE4EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris21process_single_columnILNS_13PrimitiveTypeE5EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris21process_single_columnILNS_13PrimitiveTypeE6EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris21process_single_columnILNS_13PrimitiveTypeE7EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris21process_single_columnILNS_13PrimitiveTypeE8EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris21process_single_columnILNS_13PrimitiveTypeE9EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris21process_single_columnILNS_13PrimitiveTypeE2EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
517
518
template <PrimitiveType T>
519
Status process_column_array(const rapidjson::Value& col, PrimitiveType sub_type,
520
0
                            bool pure_doc_value, Array& array) {
521
0
    for (const auto& sub_col : col.GetArray()) {
522
0
        RETURN_IF_ERROR(process_single_column<T>(sub_col, sub_type, pure_doc_value, array));
523
0
    }
524
0
    return Status::OK();
525
0
}
Unexecuted instantiation: _ZN5doris20process_column_arrayILNS_13PrimitiveTypeE23EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris20process_column_arrayILNS_13PrimitiveTypeE3EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris20process_column_arrayILNS_13PrimitiveTypeE4EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris20process_column_arrayILNS_13PrimitiveTypeE5EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris20process_column_arrayILNS_13PrimitiveTypeE6EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris20process_column_arrayILNS_13PrimitiveTypeE7EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris20process_column_arrayILNS_13PrimitiveTypeE8EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris20process_column_arrayILNS_13PrimitiveTypeE9EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris20process_column_arrayILNS_13PrimitiveTypeE2EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
526
527
template <PrimitiveType T>
528
Status process_column(const rapidjson::Value& col, PrimitiveType sub_type, bool pure_doc_value,
529
0
                      Array& array) {
530
0
    if (!col.IsArray()) {
531
0
        return process_single_column<T>(col, sub_type, pure_doc_value, array);
532
0
    } else {
533
0
        return process_column_array<T>(col, sub_type, pure_doc_value, array);
534
0
    }
535
0
}
Unexecuted instantiation: _ZN5doris14process_columnILNS_13PrimitiveTypeE23EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris14process_columnILNS_13PrimitiveTypeE3EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris14process_columnILNS_13PrimitiveTypeE4EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris14process_columnILNS_13PrimitiveTypeE5EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris14process_columnILNS_13PrimitiveTypeE6EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris14process_columnILNS_13PrimitiveTypeE7EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris14process_columnILNS_13PrimitiveTypeE8EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris14process_columnILNS_13PrimitiveTypeE9EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
Unexecuted instantiation: _ZN5doris14process_columnILNS_13PrimitiveTypeE2EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayE
536
537
template <PrimitiveType T>
538
Status process_date_column(const rapidjson::Value& col, PrimitiveType sub_type, bool pure_doc_value,
539
0
                           Array& array, const cctz::time_zone& time_zone) {
540
0
    if (!col.IsArray()) {
541
0
        typename PrimitiveTypeTraits<T>::CppType data;
542
0
        RETURN_IF_ERROR((get_date_int<T>(col, sub_type, pure_doc_value, &data, time_zone)));
543
0
        array.push_back(Field::create_field<T>(data));
544
0
    } else {
545
0
        for (const auto& sub_col : col.GetArray()) {
546
0
            typename PrimitiveTypeTraits<T>::CppType data;
547
0
            RETURN_IF_ERROR((get_date_int<T>(sub_col, sub_type, pure_doc_value, &data, time_zone)));
548
0
            array.push_back(Field::create_field<T>(data));
549
0
        }
550
0
    }
551
0
    return Status::OK();
552
0
}
Unexecuted instantiation: _ZN5doris19process_date_columnILNS_13PrimitiveTypeE25EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayERKN4cctz9time_zoneE
Unexecuted instantiation: _ZN5doris19process_date_columnILNS_13PrimitiveTypeE26EEENS_6StatusERKN9rapidjson12GenericValueINS3_4UTF8IcEENS3_19MemoryPoolAllocatorINS3_12CrtAllocatorEEEEES1_bRNS_5ArrayERKN4cctz9time_zoneE
553
554
/// Appends the value(s) held by `col` to `array` as JSONB fields.
///
/// Each JSON node is serialized back to text with `json_value_to_string` and
/// re-parsed into JSONB. A JSON array is expanded element by element; any
/// other JSON value is stored as a single element.
///
/// @param col            JSON value (scalar, object or array) to convert.
/// @param sub_type       unused; kept so the signature matches the other `process_*` helpers.
/// @param pure_doc_value unused; kept for the same reason.
/// @param array          destination for the produced JSONB fields.
/// @return the error from `JsonBinaryValue::from_json_string`, otherwise OK.
Status process_jsonb_column(const rapidjson::Value& col, PrimitiveType sub_type,
                            bool pure_doc_value, Array& array) {
    // Single conversion path shared by the scalar and array cases, so both
    // hand the JsonbField over by move instead of one of them copying it.
    const auto append_jsonb = [&array](const rapidjson::Value& node) -> Status {
        JsonBinaryValue jsonb_value;
        RETURN_IF_ERROR(jsonb_value.from_json_string(json_value_to_string(node)));
        JsonbField json(jsonb_value.value(), jsonb_value.size());
        array.push_back(Field::create_field<TYPE_JSONB>(std::move(json)));
        return Status::OK();
    };

    if (!col.IsArray()) {
        return append_jsonb(col);
    }
    for (const auto& sub_col : col.GetArray()) {
        RETURN_IF_ERROR(append_jsonb(sub_col));
    }
    return Status::OK();
}
571
572
/// Converts the JSON value `col` into the elements of a Doris ARRAY column.
///
/// Dispatches on the array's element type and appends the converted
/// element(s) to `array`. CHAR, VARCHAR and STRING elements are all handled
/// by the TYPE_STRING instantiation.
///
/// @param col            JSON value holding the array content (or a single element).
/// @param sub_type       primitive type of the array elements.
/// @param pure_doc_value forwarded to the per-type converters.
/// @param array          destination for the converted elements.
/// @param time_zone      time zone used when converting date/datetime elements.
/// @return the conversion error, InternalError for an unsupported element
///         type, otherwise OK.
Status ScrollParser::parse_column(const rapidjson::Value& col, PrimitiveType sub_type,
                                  bool pure_doc_value, Array& array,
                                  const cctz::time_zone& time_zone) {
    switch (sub_type) {
    case TYPE_CHAR:
    case TYPE_VARCHAR:
    case TYPE_STRING:
        return process_column<TYPE_STRING>(col, sub_type, pure_doc_value, array);
    case TYPE_TINYINT:
        return process_column<TYPE_TINYINT>(col, sub_type, pure_doc_value, array);
    case TYPE_SMALLINT:
        return process_column<TYPE_SMALLINT>(col, sub_type, pure_doc_value, array);
    case TYPE_INT:
        return process_column<TYPE_INT>(col, sub_type, pure_doc_value, array);
    case TYPE_BIGINT:
        return process_column<TYPE_BIGINT>(col, sub_type, pure_doc_value, array);
    case TYPE_LARGEINT:
        return process_column<TYPE_LARGEINT>(col, sub_type, pure_doc_value, array);
    case TYPE_FLOAT:
        return process_column<TYPE_FLOAT>(col, sub_type, pure_doc_value, array);
    case TYPE_DOUBLE:
        return process_column<TYPE_DOUBLE>(col, sub_type, pure_doc_value, array);
    case TYPE_BOOLEAN:
        return process_column<TYPE_BOOLEAN>(col, sub_type, pure_doc_value, array);
    // date/datetime v2 is the default type for catalog table,
    // see https://github.com/apache/doris/pull/16304
    // No need to support date and datetime types.
    case TYPE_DATEV2:
        return process_date_column<TYPE_DATEV2>(col, sub_type, pure_doc_value, array, time_zone);
    case TYPE_DATETIMEV2:
        return process_date_column<TYPE_DATETIMEV2>(col, sub_type, pure_doc_value, array,
                                                    time_zone);
    case TYPE_JSONB:
        return process_jsonb_column(col, sub_type, pure_doc_value, array);
    default:
        LOG(ERROR) << "Do not support Array type: " << sub_type;
        // Name the offending element type in the returned status so the
        // failure can be diagnosed without access to the BE log.
        return Status::InternalError("Unsupported type: {}", type_to_string(sub_type));
    }
}
614
615
0
// Creates an empty parser: no hits buffered (`_size` is 0) and the row cursor
// (`_line_index`) at 0. `doc_value_mode` is not used by this constructor;
// fill_columns() decides per hit whether it carries doc values by checking
// for a `fields` member.
ScrollParser::ScrollParser(bool doc_value_mode) : _size(0), _line_index(0) {}
616
617
0
// Compiler-generated destruction; this definition adds no explicit cleanup.
ScrollParser::~ScrollParser() = default;
618
619
0
/// Parses one Elasticsearch scroll/search response and buffers its hits.
///
/// On success `_size` holds the number of buffered documents (0 when the
/// response carries no inner hits array) and the row cursor is rewound so
/// that fill_columns() starts at the first buffered hit.
///
/// @param scroll_result raw JSON text of the response.
/// @param exactly_once  when false the response must carry a scroll id, which
///                      is stored for get_scroll_id(); when true the scroll id
///                      is neither required nor read.
/// @return InternalError when the text is not valid JSON, is not a JSON
///         object, lacks a usable scroll id (only when `exactly_once` is
///         false) or lacks an object-valued `hits` member; otherwise OK.
Status ScrollParser::parse(const std::string& scroll_result, bool exactly_once) {
    // rely on `_size != 0` to determine whether scroll ends
    _size = 0;
    // A new response replaces the buffered hits, so the cursor must restart.
    _line_index = 0;

    _document_node.Parse(scroll_result.c_str(), scroll_result.length());
    if (_document_node.HasParseError()) {
        return Status::InternalError("Parsing json error, json is: {}", scroll_result);
    }
    // Member lookups below are only defined on JSON objects.
    if (!_document_node.IsObject()) {
        return Status::InternalError("Scroll response is not a json object, json is: {}",
                                     scroll_result);
    }

    if (!exactly_once) {
        const auto scroll_it = _document_node.FindMember(FIELD_SCROLL_ID);
        if (scroll_it == _document_node.MemberEnd()) {
            LOG(WARNING) << "Document has not a scroll id field scroll response:" << scroll_result;
            return Status::InternalError("Document has not a scroll id field");
        }
        // GetString() requires a string value.
        if (!scroll_it->value.IsString()) {
            return Status::InternalError("Scroll id field is not a string");
        }
        _scroll_id = scroll_it->value.GetString();
    }

    // { hits: { total : 2, "hits" : [ {}, {}, {} ]}}
    const auto outer_hits_it = _document_node.FindMember(FIELD_HITS);
    if (outer_hits_it == _document_node.MemberEnd() || !outer_hits_it->value.IsObject()) {
        LOG(WARNING) << "Document has not a valid hits field, scroll response:" << scroll_result;
        return Status::InternalError("Document has not a valid hits field");
    }
    const rapidjson::Value& outer_hits_node = outer_hits_it->value;

    // if has no inner hits, there has no data in this index
    const auto inner_hits_it = outer_hits_node.FindMember(FIELD_INNER_HITS);
    if (inner_hits_it == outer_hits_node.MemberEnd()) {
        return Status::OK();
    }
    // this happened just the end of scrolling
    if (!inner_hits_it->value.IsArray()) {
        return Status::OK();
    }

    _inner_hits_node.CopyFrom(inner_hits_it->value, _document_node.GetAllocator());
    // how many documents contains in this batch
    _size = _inner_hits_node.Size();
    return Status::OK();
}
652
653
0
// Returns `_size`: parse() resets it to 0 and, when the response contains an
// inner hits array, sets it to the number of entries in that array.
int ScrollParser::get_size() const {
654
0
    return _size;
655
0
}
656
657
0
// Returns the stored scroll id. parse() only assigns it when called with
// `exactly_once == false`; otherwise the previous value is left unchanged.
const std::string& ScrollParser::get_scroll_id() {
658
0
    return _scroll_id;
659
0
}
660
661
/// Converts the next buffered hit into one row appended across `columns`.
///
/// The hit is read from `_source` when present, otherwise from `fields`
/// (doc values). A hit that has a `fields` member is treated as doc-values
/// data, in which case column names are translated through
/// `docvalue_context` and values are expected to be wrapped in arrays.
///
/// @param tuple_desc       slots to fill; slot `i` is written to `columns[i]`.
/// @param columns          destination columns, one per slot.
/// @param line_eof         set to true when no hit was consumed (nothing
///                         buffered or all hits already read) or when an
///                         error is returned; set to false after a row was
///                         appended.
/// @param docvalue_context maps a slot's column name to the doc-values field
///                         name to look up.
/// @param time_zone        time zone used for date/datetime conversion.
/// @return OK on success or when there is nothing left to read. An error is
///         returned when a NOT NULL slot has no value, a value cannot be
///         converted, a doc-values mapping is missing, or the slot type is
///         not supported.
Status ScrollParser::fill_columns(const TupleDescriptor* tuple_desc,
                                  std::vector<MutableColumnPtr>& columns, bool* line_eof,
                                  const std::map<std::string, std::string>& docvalue_context,
                                  const cctz::time_zone& time_zone) {
    *line_eof = true;

    if (_size <= 0 || _line_index >= _size) {
        return Status::OK();
    }

    const rapidjson::Value& obj = _inner_hits_node[_line_index++];
    const bool pure_doc_value = obj.HasMember("fields");
    // obj may be neither have `_source` nor `fields` field.
    const rapidjson::Value* line = nullptr;
    if (obj.HasMember(FIELD_SOURCE)) {
        line = &obj[FIELD_SOURCE];
    } else if (pure_doc_value) {
        line = &obj["fields"];
    }

    const auto& slots = tuple_desc->slots();
    for (size_t i = 0; i < slots.size(); ++i) {
        const SlotDescriptor* slot_desc = slots[i];
        auto* col_ptr = columns[i].get();

        // Appends NULL for the current slot, or fails when the slot is NOT NULL.
        const auto insert_null_or_fail = [&](const char* name) -> Status {
            if (!slot_desc->is_nullable()) {
                std::string details = absl::Substitute(INVALID_NULL_VALUE, name);
                return Status::RuntimeError(details);
            }
            col_ptr->insert_data(nullptr, 0);
            return Status::OK();
        };
        // Appends a fixed-width value from its raw bytes. The length argument
        // is passed as 0, as this function has always done for such values.
        const auto insert_fixed = [&](const void* value) {
            col_ptr->insert_data(static_cast<const char*>(value), 0);
        };

        if (slot_desc->col_name() == FIELD_ID) {
            // actually this branch will not be reached, this is guaranteed by Doris FE.
            if (pure_doc_value) {
                return Status::RuntimeError("obtain `_id` is not supported in doc_values mode");
            }
            // obj[FIELD_ID] must not be NULL
            const rapidjson::Value& id_node = obj[FIELD_ID];
            col_ptr->insert_data(id_node.GetString(), id_node.GetStringLength());
            continue;
        }

        const char* col_name = slot_desc->col_name().c_str();
        if (pure_doc_value) {
            // Report a missing mapping as a Status instead of letting
            // std::map::at throw out of this function.
            const auto ctx_it = docvalue_context.find(slot_desc->col_name());
            if (ctx_it == docvalue_context.end()) {
                return Status::InternalError("No doc_values field mapping for column: {}",
                                             slot_desc->col_name());
            }
            col_name = ctx_it->second.c_str();
        }

        // A field that is absent and a field that is JSON null are handled the
        // same way: NULL for nullable slots, an error otherwise.
        const rapidjson::Value* col_node = nullptr;
        if (line != nullptr) {
            const auto member_it = line->FindMember(col_name);
            if (member_it != line->MemberEnd()) {
                col_node = &member_it->value;
            }
        }
        // when the column value is null, the subsequent type casting will report an error
        if (col_node == nullptr || col_node->IsNull()) {
            RETURN_IF_ERROR(insert_null_or_fail(col_name));
            continue;
        }
        const rapidjson::Value& col = *col_node;

        auto type = slot_desc->type()->get_primitive_type();

        switch (type) {
        case TYPE_CHAR:
        case TYPE_VARCHAR:
        case TYPE_STRING: {
            // sometimes elasticsearch user post some not-string value to Elasticsearch Index.
            // because of reading value from _source, we can not process all json type and then
            // just transfer the value to original string representation
            // this may be a tricky, but we can work around this issue
            std::string val;
            if (pure_doc_value) {
                if (col.Empty()) {
                    // An empty doc-values array carries no value. Every slot
                    // must still receive exactly one entry per row, so treat
                    // it like a missing field.
                    RETURN_IF_ERROR(insert_null_or_fail(col_name));
                    break;
                }
                if (col.Size() > 1) {
                    // doc_values with multiple elements means actual array data
                    // in ES keyword/text field, serialize as JSON array string
                    val = json_value_to_string(col);
                } else if (col[0].IsString()) {
                    val = col[0].GetString();
                } else {
                    val = json_value_to_string(col[0]);
                }
            } else if (col.IsString()) {
                val = col.GetString();
            } else {
                // When ES mapping is keyword/text but actual data is an array (or any
                // other non-string value), serialize it to a JSON string instead of
                // throwing an error. This is valid in ES since any field can hold
                // array values.
                val = json_value_to_string(col);
            }
            col_ptr->insert_data(val.data(), val.length());
            break;
        }

        case TYPE_TINYINT: {
            RETURN_IF_ERROR(insert_int_value<int8_t>(col, type, col_ptr, pure_doc_value,
                                                     slot_desc->is_nullable()));
            break;
        }

        case TYPE_SMALLINT: {
            RETURN_IF_ERROR(insert_int_value<int16_t>(col, type, col_ptr, pure_doc_value,
                                                      slot_desc->is_nullable()));
            break;
        }

        case TYPE_INT: {
            RETURN_IF_ERROR(insert_int_value<int32_t>(col, type, col_ptr, pure_doc_value,
                                                      slot_desc->is_nullable()));
            break;
        }

        case TYPE_BIGINT: {
            RETURN_IF_ERROR(insert_int_value<int64_t>(col, type, col_ptr, pure_doc_value,
                                                      slot_desc->is_nullable()));
            break;
        }

        case TYPE_LARGEINT: {
            RETURN_IF_ERROR(insert_int_value<__int128>(col, type, col_ptr, pure_doc_value,
                                                       slot_desc->is_nullable()));
            break;
        }

        case TYPE_DOUBLE: {
            RETURN_IF_ERROR(insert_float_value<double>(col, type, col_ptr, pure_doc_value,
                                                       slot_desc->is_nullable()));
            break;
        }

        case TYPE_FLOAT: {
            RETURN_IF_ERROR(insert_float_value<float>(col, type, col_ptr, pure_doc_value,
                                                      slot_desc->is_nullable()));
            break;
        }

        case TYPE_BOOLEAN: {
            if (col.IsBool()) {
                int8_t val = col.GetBool();
                insert_fixed(&val);
                break;
            }

            if (col.IsNumber()) {
                int8_t val = static_cast<int8_t>(col.GetInt());
                insert_fixed(&val);
                break;
            }

            bool is_nested_str = false;
            if (pure_doc_value && col.IsArray()) {
                if (!col.Empty() && col[0].IsBool()) {
                    int8_t val = col[0].GetBool();
                    insert_fixed(&val);
                    break;
                }
                if (col.Empty() || !col[0].IsString()) {
                    return Status::InternalError(ERROR_INVALID_COL_DATA, "BOOLEAN");
                }
                is_nested_str = true;
            }

            const rapidjson::Value& str_col = is_nested_str ? col[0] : col;

            // NOTE(review): the macro is given `col`, not `str_col`, exactly as
            // before; confirm that is intended for the nested-string case.
            RETURN_ERROR_IF_COL_IS_ARRAY(col, type, true);

            // Only strings are left to parse; GetString() requires a string value.
            if (!str_col.IsString()) {
                return Status::InternalError(ERROR_INVALID_COL_DATA, "BOOLEAN");
            }

            const std::string& val = str_col.GetString();
            size_t val_size = str_col.GetStringLength();
            StringParser::ParseResult result;
            bool b = StringParser::string_to_bool(val.c_str(), val_size, &result);
            RETURN_ERROR_IF_PARSING_FAILED(result, str_col, type);
            insert_fixed(&b);
            break;
        }

        case TYPE_DECIMALV2: {
            DecimalV2Value data;

            if (col.IsDouble()) {
                data.assign_from_double(col.GetDouble());
            } else {
                std::string val;
                if (pure_doc_value) {
                    if (col.Empty()) {
                        // Same reasoning as for strings: keep the column in
                        // step with the other slots of this row.
                        RETURN_IF_ERROR(insert_null_or_fail(col_name));
                        break;
                    }
                    if (col[0].IsString()) {
                        val = col[0].GetString();
                    } else {
                        val = json_value_to_string(col[0]);
                    }
                } else {
                    RETURN_ERROR_IF_COL_IS_ARRAY(col, type, true);
                    if (col.IsString()) {
                        val = col.GetString();
                    } else {
                        val = json_value_to_string(col);
                    }
                }
                data.parse_from_str(val.data(), static_cast<int32_t>(val.length()));
            }
            insert_fixed(&data);
            break;
        }

        case TYPE_DATE:
            RETURN_IF_ERROR(
                    fill_date_int<TYPE_DATE>(col, type, pure_doc_value, col_ptr, time_zone));
            break;
        case TYPE_DATETIME:
            RETURN_IF_ERROR(
                    fill_date_int<TYPE_DATETIME>(col, type, pure_doc_value, col_ptr, time_zone));
            break;
        case TYPE_DATEV2:
            RETURN_IF_ERROR(
                    fill_date_int<TYPE_DATEV2>(col, type, pure_doc_value, col_ptr, time_zone));
            break;
        case TYPE_DATETIMEV2: {
            RETURN_IF_ERROR(
                    fill_date_int<TYPE_DATETIMEV2>(col, type, pure_doc_value, col_ptr, time_zone));
            break;
        }
        case TYPE_ARRAY: {
            Array array;
            const PrimitiveType sub_type =
                    assert_cast<const DataTypeArray*>(remove_nullable(slot_desc->type()).get())
                            ->get_nested_type()
                            ->get_primitive_type();
            RETURN_IF_ERROR(parse_column(col, sub_type, pure_doc_value, array, time_zone));
            col_ptr->insert(Field::create_field<TYPE_ARRAY>(array));
            break;
        }
        case TYPE_JSONB: {
            JsonBinaryValue jsonb_value;
            RETURN_IF_ERROR(jsonb_value.from_json_string(json_value_to_string(col)));
            JsonbField json(jsonb_value.value(), jsonb_value.size());
            col_ptr->insert(Field::create_field<TYPE_JSONB>(json));
            break;
        }
        default: {
            LOG(ERROR) << "Unsupported data type: " << type_to_string(type);
            DCHECK(false);
            // Without this return a build that compiles DCHECK out would skip
            // the slot and leave its column one row short.
            return Status::InternalError("Unsupported data type: {}", type_to_string(type));
        }
        }
    }

    *line_eof = false;
    return Status::OK();
}
916
} // namespace doris