Coverage Report

Created: 2026-05-19 14:02

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/util/string_parser.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "util/string_parser.hpp"
19
20
#include <limits>
21
22
#include "core/extended_types.h"
23
#include "core/types.h"
24
namespace doris {
25
#include "common/compile_check_avoid_begin.h"
26
// Supported decimal number format:
27
// <decimal> ::= <whitespace>* <value> <whitespace>*
28
//
29
// <whitespace> ::= " " | "\t" | "\n" | "\r" | "\f" | "\v"
30
//
31
// <value> ::= <sign>? <significand> <exponent>?
32
//
33
// <sign> ::= "+" | "-"
34
//
35
// <significand> ::= <digits> "." <digits> | <digits> | <digits> "." | "." <digits>
36
//
37
// <digits> ::= <digit>+
38
//
39
// <digit> ::= "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9"
40
//
41
// <exponent> ::= <e_marker> <sign>? <digits>
42
//
43
// <e_marker> ::= "e" | "E"
44
//
45
// Parsing algorithm:
46
// 1. Trim spaces and the sign, then normalize the significand by skipping leading zeros and an
47
//    optional leading dot. During this scan, count digits that belong to the original integral
48
//    part (`int_part_count`) and remember where the significand ends (`end_digit_index`).
49
// 2. Parse the optional exponent. Scientific notation is handled by moving the decimal point:
50
//    `result_int_part_digit_count = int_part_count + exponent`. For example, "12.34e-1" has
51
//    int_part_count=2 and exponent=-1, so the result has one integral digit: "1.234".
52
// 3. Build the result in scaled-integer form: first collect the integral digits up to the shifted
53
//    decimal point, then collect up to `type_scale` fractional digits, padding with zeros when the
54
//    input has fewer fractional digits than the target scale.
55
// 4. If there are extra fractional digits, round half up using the first discarded digit. Finally,
56
//    check the integral digit count against `type_precision - type_scale` and return the signed
57
//    scaled integer value.
58
template <PrimitiveType P>
59
typename PrimitiveTypeTraits<P>::CppType::NativeType StringParser::string_to_decimal(
60
        const char* __restrict s, size_t len, int type_precision, int type_scale,
61
17.7M
        ParseResult* result) {
62
17.7M
    using T = typename PrimitiveTypeTraits<P>::CppType::NativeType;
63
17.7M
    static_assert(std::is_same_v<T, int32_t> || std::is_same_v<T, int64_t> ||
64
17.7M
                          std::is_same_v<T, __int128> || std::is_same_v<T, wide::Int256>,
65
17.7M
                  "Cast string to decimal only support target type int32_t, int64_t, __int128 or "
66
17.7M
                  "wide::Int256.");
67
68
    // Parse in two logical coordinate systems:
69
    // 1. `s[0, end_digit_index)` is the normalized significand after trimming spaces, sign and
70
    //    leading zeros. If the original value starts with '.', the dot is also skipped so
71
    //    ".14E+3" is parsed as significand "14" with exponent 3.
72
    // 2. `result_int_part_digit_count = int_part_count + exponent` is the decimal point position
73
    //    after applying scientific notation. For example, "1.4E+2" has int_part_count=1,
74
    //    exponent=2, result_int_part_digit_count=3, so "14" becomes integer 140.
75
    // `digit_index` always indexes the normalized significand string, which may still contain a
76
    // dot for inputs like "1.4E+2"; loops that build numbers skip that dot explicitly.
77
    // Ignore leading and trailing spaces.
78
17.7M
    s = skip_ascii_whitespaces(s, len);
79
80
17.7M
    bool is_negative = false;
81
17.7M
    if (len > 0) {
82
17.6M
        switch (*s) {
83
198k
        case '-':
84
198k
            is_negative = true;
85
198k
            [[fallthrough]];
86
225k
        case '+':
87
225k
            ++s;
88
225k
            --len;
89
17.6M
        }
90
17.6M
    }
91
    // Ignore leading zeros.
92
17.7M
    bool found_value = false;
93
25.2M
    while (len > 0 && UNLIKELY(*s == '0')) {
94
7.55M
        found_value = true;
95
7.55M
        ++s;
96
7.55M
        --len;
97
7.55M
    }
98
99
17.7M
    int found_dot = 0;
100
17.7M
    if (len > 0 && *s == '.') {
101
7.29M
        found_dot = 1;
102
7.29M
        ++s;
103
7.29M
        --len;
104
7.29M
    }
105
17.7M
    int int_part_count = 0;
106
17.7M
    int i = 0;
107
114M
    for (; i != len; ++i) {
108
97.2M
        const char& c = s[i];
109
97.2M
        if (LIKELY('0' <= c && c <= '9')) {
110
90.7M
            found_value = true;
111
90.7M
            if (!found_dot) {
112
55.0M
                ++int_part_count;
113
55.0M
            }
114
90.7M
        } else if (c == '.') {
115
6.39M
            if (found_dot) {
116
2
                *result = StringParser::PARSE_FAILURE;
117
2
                return 0;
118
2
            }
119
6.39M
            found_dot = 1;
120
6.39M
        } else {
121
105k
            break;
122
105k
        }
123
97.2M
    }
124
17.7M
    if (!found_value) {
125
        // '', '.'
126
98.0k
        *result = StringParser::PARSE_FAILURE;
127
98.0k
        return 0;
128
98.0k
    }
129
    // Parse exponent if any. Keep `end_digit_index` before consuming 'e/E' so later digit counts
130
    // ignore exponent syntax. For "1.4E+2", end_digit_index points just after "1.4", not after
131
    // "E+2".
132
17.6M
    int64_t exponent = 0;
133
17.6M
    auto end_digit_index = i;
134
17.6M
    if (i != len) {
135
113k
        bool negative_exponent = false;
136
113k
        if (s[i] == 'e' || s[i] == 'E') {
137
113k
            ++i;
138
113k
            if (i != len) {
139
113k
                switch (s[i]) {
140
15.6k
                case '-':
141
15.6k
                    negative_exponent = true;
142
15.6k
                    [[fallthrough]];
143
82.2k
                case '+':
144
82.2k
                    ++i;
145
113k
                }
146
113k
            }
147
113k
            if (i == len) {
148
                // '123e', '123e+', '123e-'
149
6
                *result = StringParser::PARSE_FAILURE;
150
6
                return 0;
151
6
            }
152
335k
            for (; i != len; ++i) {
153
222k
                const char& c = s[i];
154
222k
                if (LIKELY('0' <= c && c <= '9')) {
155
222k
                    exponent = exponent * 10 + (c - '0');
156
                    // max string len is config::string_type_length_soft_limit_bytes,
157
                    // whose max value is std::numeric_limits<int32_t>::max() - 4,
158
                    // just check overflow of int32_t to simplify the logic
159
                    // For edge cases like 0.{2147483647 zeros}e+2147483647
160
222k
                    if (exponent > std::numeric_limits<int32_t>::max()) {
161
0
                        *result = StringParser::PARSE_OVERFLOW;
162
0
                        return 0;
163
0
                    }
164
222k
                } else {
165
                    // '123e12abc', '123e1.2'
166
22
                    *result = StringParser::PARSE_FAILURE;
167
22
                    return 0;
168
22
                }
169
222k
            }
170
113k
            if (negative_exponent) {
171
15.6k
                exponent = -exponent;
172
15.6k
            }
173
113k
        } else {
174
206
            *result = StringParser::PARSE_FAILURE;
175
206
            return 0;
176
206
        }
177
113k
    }
178
    // TODO: check limit values of exponent and add UT
179
    // max string len is config::string_type_length_soft_limit_bytes,
180
    // whose max value is std::numeric_limits<int32_t>::max() - 4,
181
    // so int_part_count will be in range of int32_t,
182
    // and int_part_count + exponent will be in range of int64_t
183
17.6M
    int64_t tmp_result_int_part_digit_count = int_part_count + exponent;
184
17.6M
    if (tmp_result_int_part_digit_count > std::numeric_limits<int>::max() ||
185
17.6M
        tmp_result_int_part_digit_count < std::numeric_limits<int>::min()) {
186
0
        *result = is_negative ? StringParser::PARSE_UNDERFLOW : StringParser::PARSE_OVERFLOW;
187
0
        return 0;
188
0
    }
189
17.6M
    int result_int_part_digit_count = tmp_result_int_part_digit_count;
190
17.6M
    T int_part_number = 0;
191
17.6M
    T frac_part_number = 0;
192
17.6M
    int actual_frac_part_count = 0;
193
17.6M
    int digit_index = 0;
194
17.6M
    if (result_int_part_digit_count >= 0) {
195
        // `max_index` is the raw significand index where integer-part digits stop. Add one extra
196
        // raw character only when crossing an in-buffer dot, e.g. "1.4E+2" must scan "1.4" to
197
        // collect three integer digits after the exponent shift. It is capped by end_digit_index
198
        // because missing digits are appended later by multiplying with powers of 10.
199
17.6M
        int max_index = std::min(found_dot ? (result_int_part_digit_count +
200
13.6M
                                              ((int_part_count > 0 && exponent > 0) ? 1 : 0))
201
17.6M
                                           : result_int_part_digit_count,
202
17.6M
                                 end_digit_index);
203
17.6M
        max_index = (max_index == std::numeric_limits<int>::min() ? end_digit_index : max_index);
204
        // skip zero number
205
18.4M
        for (; digit_index != max_index && s[digit_index] == '0'; ++digit_index) {
206
858k
        }
207
        // test 0.00, .00, 0.{00...}e2147483647
208
        // 0.00000e2147483647
209
17.6M
        if (digit_index != max_index &&
210
17.6M
            (result_int_part_digit_count - digit_index > type_precision - type_scale)) {
211
17.0k
            *result = is_negative ? StringParser::PARSE_UNDERFLOW : StringParser::PARSE_OVERFLOW;
212
17.0k
            return 0;
213
17.0k
        }
214
        // get int part number
215
72.9M
        for (; digit_index != max_index; ++digit_index) {
216
55.3M
            if (UNLIKELY(s[digit_index] == '.')) {
217
71.1k
                continue;
218
71.1k
            }
219
55.2M
            int_part_number = int_part_number * 10 + (s[digit_index] - '0');
220
55.2M
        }
221
        // Count only significand digits, not exponent syntax. If the exponent moves the decimal
222
        // point past all available significant digits, append zeros by scaling the integer part:
223
        // "1.4E+2" scans integer 14, total_significant_digit_count=2, then multiplies by 10.
224
17.5M
        auto total_significant_digit_count =
225
17.5M
                end_digit_index - ((found_dot && int_part_count > 0) ? 1 : 0);
226
17.5M
        if (result_int_part_digit_count > total_significant_digit_count) {
227
64.8k
            int_part_number *= get_scale_multiplier<T>(result_int_part_digit_count -
228
64.8k
                                                       total_significant_digit_count);
229
64.8k
        }
230
17.5M
    } else {
231
        // leading zeros of fraction part
232
21.0k
        actual_frac_part_count = -result_int_part_digit_count;
233
21.0k
    }
234
    // get fraction part number
235
57.3M
    for (; digit_index != end_digit_index && actual_frac_part_count < type_scale; ++digit_index) {
236
39.6M
        if (UNLIKELY(s[digit_index] == '.')) {
237
6.09M
            continue;
238
6.09M
        }
239
33.5M
        frac_part_number = frac_part_number * 10 + (s[digit_index] - '0');
240
33.5M
        ++actual_frac_part_count;
241
33.5M
    }
242
17.6M
    auto type_scale_multiplier = get_scale_multiplier<T>(type_scale);
243
    // Round only when the next parsed significand digit is exactly the first discarded fractional
244
    // digit. If `actual_frac_part_count` is already greater than type_scale, the missing positions
245
    // are implicit zeros from a negative exponent, so "5e-17" to scale 15 must stay 0 instead of
246
    // rounding up.
247
17.6M
    if (actual_frac_part_count == type_scale && digit_index != end_digit_index) {
248
285k
        if (UNLIKELY(s[digit_index] == '.')) {
249
203k
            ++digit_index;
250
203k
        }
251
285k
        if (digit_index != end_digit_index) {
252
            // example: test 1.5 -> decimal(1, 0)
253
284k
            if (s[digit_index] >= '5') {
254
200k
                ++frac_part_number;
255
200k
                if (frac_part_number == type_scale_multiplier) {
256
168k
                    frac_part_number = 0;
257
168k
                    ++int_part_number;
258
168k
                }
259
200k
            }
260
284k
        }
261
17.3M
    } else {
262
17.3M
        if (actual_frac_part_count < type_scale) {
263
4.11M
            frac_part_number *= get_scale_multiplier<T>(type_scale - actual_frac_part_count);
264
4.11M
        }
265
17.3M
    }
266
17.6M
    if (int_part_number >= get_scale_multiplier<T>(type_precision - type_scale)) {
267
152
        *result = is_negative ? StringParser::PARSE_UNDERFLOW : StringParser::PARSE_OVERFLOW;
268
152
        return 0;
269
152
    }
270
271
17.6M
    T value = int_part_number * type_scale_multiplier + frac_part_number;
272
17.6M
    *result = StringParser::PARSE_SUCCESS;
273
17.6M
    return is_negative ? T(-value) : T(value);
274
17.6M
}
_ZN5doris12StringParser17string_to_decimalILNS_13PrimitiveTypeE28EEENS_19PrimitiveTypeTraitsIXT_EE7CppType10NativeTypeEPKcmiiPNS0_11ParseResultE
Line
Count
Source
61
321k
        ParseResult* result) {
62
321k
    using T = typename PrimitiveTypeTraits<P>::CppType::NativeType;
63
321k
    static_assert(std::is_same_v<T, int32_t> || std::is_same_v<T, int64_t> ||
64
321k
                          std::is_same_v<T, __int128> || std::is_same_v<T, wide::Int256>,
65
321k
                  "Cast string to decimal only support target type int32_t, int64_t, __int128 or "
66
321k
                  "wide::Int256.");
67
68
    // Parse in two logical coordinate systems:
69
    // 1. `s[0, end_digit_index)` is the normalized significand after trimming spaces, sign and
70
    //    leading zeros. If the original value starts with '.', the dot is also skipped so
71
    //    ".14E+3" is parsed as significand "14" with exponent 3.
72
    // 2. `result_int_part_digit_count = int_part_count + exponent` is the decimal point position
73
    //    after applying scientific notation. For example, "1.4E+2" has int_part_count=1,
74
    //    exponent=2, result_int_part_digit_count=3, so "14" becomes integer 140.
75
    // `digit_index` always indexes the normalized significand string, which may still contain a
76
    // dot for inputs like "1.4E+2"; loops that build numbers skip that dot explicitly.
77
    // Ignore leading and trailing spaces.
78
321k
    s = skip_ascii_whitespaces(s, len);
79
80
321k
    bool is_negative = false;
81
321k
    if (len > 0) {
82
316k
        switch (*s) {
83
60.1k
        case '-':
84
60.1k
            is_negative = true;
85
60.1k
            [[fallthrough]];
86
67.4k
        case '+':
87
67.4k
            ++s;
88
67.4k
            --len;
89
316k
        }
90
316k
    }
91
    // Ignore leading zeros.
92
321k
    bool found_value = false;
93
378k
    while (len > 0 && UNLIKELY(*s == '0')) {
94
56.7k
        found_value = true;
95
56.7k
        ++s;
96
56.7k
        --len;
97
56.7k
    }
98
99
321k
    int found_dot = 0;
100
321k
    if (len > 0 && *s == '.') {
101
19.6k
        found_dot = 1;
102
19.6k
        ++s;
103
19.6k
        --len;
104
19.6k
    }
105
321k
    int int_part_count = 0;
106
321k
    int i = 0;
107
2.16M
    for (; i != len; ++i) {
108
1.85M
        const char& c = s[i];
109
1.85M
        if (LIKELY('0' <= c && c <= '9')) {
110
1.58M
            found_value = true;
111
1.58M
            if (!found_dot) {
112
915k
                ++int_part_count;
113
915k
            }
114
1.58M
        } else if (c == '.') {
115
255k
            if (found_dot) {
116
2
                *result = StringParser::PARSE_FAILURE;
117
2
                return 0;
118
2
            }
119
255k
            found_dot = 1;
120
255k
        } else {
121
10.9k
            break;
122
10.9k
        }
123
1.85M
    }
124
321k
    if (!found_value) {
125
        // '', '.'
126
6.97k
        *result = StringParser::PARSE_FAILURE;
127
6.97k
        return 0;
128
6.97k
    }
129
    // Parse exponent if any. Keep `end_digit_index` before consuming 'e/E' so later digit counts
130
    // ignore exponent syntax. For "1.4E+2", end_digit_index points just after "1.4", not after
131
    // "E+2".
132
314k
    int64_t exponent = 0;
133
314k
    auto end_digit_index = i;
134
314k
    if (i != len) {
135
9.42k
        bool negative_exponent = false;
136
9.42k
        if (s[i] == 'e' || s[i] == 'E') {
137
9.33k
            ++i;
138
9.33k
            if (i != len) {
139
9.33k
                switch (s[i]) {
140
1.54k
                case '-':
141
1.54k
                    negative_exponent = true;
142
1.54k
                    [[fallthrough]];
143
1.54k
                case '+':
144
1.54k
                    ++i;
145
9.33k
                }
146
9.33k
            }
147
9.33k
            if (i == len) {
148
                // '123e', '123e+', '123e-'
149
6
                *result = StringParser::PARSE_FAILURE;
150
6
                return 0;
151
6
            }
152
24.6k
            for (; i != len; ++i) {
153
15.3k
                const char& c = s[i];
154
15.3k
                if (LIKELY('0' <= c && c <= '9')) {
155
15.3k
                    exponent = exponent * 10 + (c - '0');
156
                    // max string len is config::string_type_length_soft_limit_bytes,
157
                    // whose max value is std::numeric_limits<int32_t>::max() - 4,
158
                    // just check overflow of int32_t to simplify the logic
159
                    // For edge cases like 0.{2147483647 zeros}e+2147483647
160
15.3k
                    if (exponent > std::numeric_limits<int32_t>::max()) {
161
0
                        *result = StringParser::PARSE_OVERFLOW;
162
0
                        return 0;
163
0
                    }
164
15.3k
                } else {
165
                    // '123e12abc', '123e1.2'
166
12
                    *result = StringParser::PARSE_FAILURE;
167
12
                    return 0;
168
12
                }
169
15.3k
            }
170
9.31k
            if (negative_exponent) {
171
1.53k
                exponent = -exponent;
172
1.53k
            }
173
9.31k
        } else {
174
90
            *result = StringParser::PARSE_FAILURE;
175
90
            return 0;
176
90
        }
177
9.42k
    }
178
    // TODO: check limit values of exponent and add UT
179
    // max string len is config::string_type_length_soft_limit_bytes,
180
    // whose max value is std::numeric_limits<int32_t>::max() - 4,
181
    // so int_part_count will be in range of int32_t,
182
    // and int_part_count + exponent will be in range of int64_t
183
314k
    int64_t tmp_result_int_part_digit_count = int_part_count + exponent;
184
314k
    if (tmp_result_int_part_digit_count > std::numeric_limits<int>::max() ||
185
314k
        tmp_result_int_part_digit_count < std::numeric_limits<int>::min()) {
186
0
        *result = is_negative ? StringParser::PARSE_UNDERFLOW : StringParser::PARSE_OVERFLOW;
187
0
        return 0;
188
0
    }
189
314k
    int result_int_part_digit_count = tmp_result_int_part_digit_count;
190
314k
    T int_part_number = 0;
191
314k
    T frac_part_number = 0;
192
314k
    int actual_frac_part_count = 0;
193
314k
    int digit_index = 0;
194
314k
    if (result_int_part_digit_count >= 0) {
195
        // `max_index` is the raw significand index where integer-part digits stop. Add one extra
196
        // raw character only when crossing an in-buffer dot, e.g. "1.4E+2" must scan "1.4" to
197
        // collect three integer digits after the exponent shift. It is capped by end_digit_index
198
        // because missing digits are appended later by multiplying with powers of 10.
199
314k
        int max_index = std::min(found_dot ? (result_int_part_digit_count +
200
274k
                                              ((int_part_count > 0 && exponent > 0) ? 1 : 0))
201
314k
                                           : result_int_part_digit_count,
202
314k
                                 end_digit_index);
203
314k
        max_index = (max_index == std::numeric_limits<int>::min() ? end_digit_index : max_index);
204
        // skip zero number
205
527k
        for (; digit_index != max_index && s[digit_index] == '0'; ++digit_index) {
206
212k
        }
207
        // test 0.00, .00, 0.{00...}e2147483647
208
        // 0.00000e2147483647
209
314k
        if (digit_index != max_index &&
210
314k
            (result_int_part_digit_count - digit_index > type_precision - type_scale)) {
211
6.03k
            *result = is_negative ? StringParser::PARSE_UNDERFLOW : StringParser::PARSE_OVERFLOW;
212
6.03k
            return 0;
213
6.03k
        }
214
        // get int part number
215
1.12M
        for (; digit_index != max_index; ++digit_index) {
216
816k
            if (UNLIKELY(s[digit_index] == '.')) {
217
1.60k
                continue;
218
1.60k
            }
219
814k
            int_part_number = int_part_number * 10 + (s[digit_index] - '0');
220
814k
        }
221
        // Count only significand digits, not exponent syntax. If the exponent moves the decimal
222
        // point past all available significant digits, append zeros by scaling the integer part:
223
        // "1.4E+2" scans integer 14, total_significant_digit_count=2, then multiplies by 10.
224
308k
        auto total_significant_digit_count =
225
308k
                end_digit_index - ((found_dot && int_part_count > 0) ? 1 : 0);
226
308k
        if (result_int_part_digit_count > total_significant_digit_count) {
227
100
            int_part_number *= get_scale_multiplier<T>(result_int_part_digit_count -
228
100
                                                       total_significant_digit_count);
229
100
        }
230
308k
    } else {
231
        // leading zeros of fraction part
232
26
        actual_frac_part_count = -result_int_part_digit_count;
233
26
    }
234
    // get fraction part number
235
576k
    for (; digit_index != end_digit_index && actual_frac_part_count < type_scale; ++digit_index) {
236
267k
        if (UNLIKELY(s[digit_index] == '.')) {
237
46.2k
            continue;
238
46.2k
        }
239
221k
        frac_part_number = frac_part_number * 10 + (s[digit_index] - '0');
240
221k
        ++actual_frac_part_count;
241
221k
    }
242
308k
    auto type_scale_multiplier = get_scale_multiplier<T>(type_scale);
243
    // Round only when the next parsed significand digit is exactly the first discarded fractional
244
    // digit. If `actual_frac_part_count` is already greater than type_scale, the missing positions
245
    // are implicit zeros from a negative exponent, so "5e-17" to scale 15 must stay 0 instead of
246
    // rounding up.
247
308k
    if (actual_frac_part_count == type_scale && digit_index != end_digit_index) {
248
221k
        if (UNLIKELY(s[digit_index] == '.')) {
249
200k
            ++digit_index;
250
200k
        }
251
221k
        if (digit_index != end_digit_index) {
252
            // example: test 1.5 -> decimal(1, 0)
253
220k
            if (s[digit_index] >= '5') {
254
174k
                ++frac_part_number;
255
174k
                if (frac_part_number == type_scale_multiplier) {
256
165k
                    frac_part_number = 0;
257
165k
                    ++int_part_number;
258
165k
                }
259
174k
            }
260
220k
        }
261
221k
    } else {
262
87.2k
        if (actual_frac_part_count < type_scale) {
263
30.1k
            frac_part_number *= get_scale_multiplier<T>(type_scale - actual_frac_part_count);
264
30.1k
        }
265
87.2k
    }
266
308k
    if (int_part_number >= get_scale_multiplier<T>(type_precision - type_scale)) {
267
24
        *result = is_negative ? StringParser::PARSE_UNDERFLOW : StringParser::PARSE_OVERFLOW;
268
24
        return 0;
269
24
    }
270
271
308k
    T value = int_part_number * type_scale_multiplier + frac_part_number;
272
308k
    *result = StringParser::PARSE_SUCCESS;
273
308k
    return is_negative ? T(-value) : T(value);
274
308k
}
_ZN5doris12StringParser17string_to_decimalILNS_13PrimitiveTypeE29EEENS_19PrimitiveTypeTraitsIXT_EE7CppType10NativeTypeEPKcmiiPNS0_11ParseResultE
Line
Count
Source
61
15.1M
        ParseResult* result) {
62
15.1M
    using T = typename PrimitiveTypeTraits<P>::CppType::NativeType;
63
15.1M
    static_assert(std::is_same_v<T, int32_t> || std::is_same_v<T, int64_t> ||
64
15.1M
                          std::is_same_v<T, __int128> || std::is_same_v<T, wide::Int256>,
65
15.1M
                  "Cast string to decimal only support target type int32_t, int64_t, __int128 or "
66
15.1M
                  "wide::Int256.");
67
68
    // Parse in two logical coordinate systems:
69
    // 1. `s[0, end_digit_index)` is the normalized significand after trimming spaces, sign and
70
    //    leading zeros. If the original value starts with '.', the dot is also skipped so
71
    //    ".14E+3" is parsed as significand "14" with exponent 3.
72
    // 2. `result_int_part_digit_count = int_part_count + exponent` is the decimal point position
73
    //    after applying scientific notation. For example, "1.4E+2" has int_part_count=1,
74
    //    exponent=2, result_int_part_digit_count=3, so "14" becomes integer 140.
75
    // `digit_index` always indexes the normalized significand string, which may still contain a
76
    // dot for inputs like "1.4E+2"; loops that build numbers skip that dot explicitly.
77
    // Ignore leading and trailing spaces.
78
15.1M
    s = skip_ascii_whitespaces(s, len);
79
80
15.1M
    bool is_negative = false;
81
15.1M
    if (len > 0) {
82
15.1M
        switch (*s) {
83
31.0k
        case '-':
84
31.0k
            is_negative = true;
85
31.0k
            [[fallthrough]];
86
37.7k
        case '+':
87
37.7k
            ++s;
88
37.7k
            --len;
89
15.1M
        }
90
15.1M
    }
91
    // Ignore leading zeros.
92
15.1M
    bool found_value = false;
93
22.4M
    while (len > 0 && UNLIKELY(*s == '0')) {
94
7.28M
        found_value = true;
95
7.28M
        ++s;
96
7.28M
        --len;
97
7.28M
    }
98
99
15.1M
    int found_dot = 0;
100
15.1M
    if (len > 0 && *s == '.') {
101
7.22M
        found_dot = 1;
102
7.22M
        ++s;
103
7.22M
        --len;
104
7.22M
    }
105
15.1M
    int int_part_count = 0;
106
15.1M
    int i = 0;
107
70.6M
    for (; i != len; ++i) {
108
55.5M
        const char& c = s[i];
109
55.5M
        if (LIKELY('0' <= c && c <= '9')) {
110
51.2M
            found_value = true;
111
51.2M
            if (!found_dot) {
112
27.7M
                ++int_part_count;
113
27.7M
            }
114
51.2M
        } else if (c == '.') {
115
4.26M
            if (found_dot) {
116
0
                *result = StringParser::PARSE_FAILURE;
117
0
                return 0;
118
0
            }
119
4.26M
            found_dot = 1;
120
4.26M
        } else {
121
6.48k
            break;
122
6.48k
        }
123
55.5M
    }
124
15.1M
    if (!found_value) {
125
        // '', '.'
126
379
        *result = StringParser::PARSE_FAILURE;
127
379
        return 0;
128
379
    }
129
    // Parse exponent if any. Keep `end_digit_index` before consuming 'e/E' so later digit counts
130
    // ignore exponent syntax. For "1.4E+2", end_digit_index points just after "1.4", not after
131
    // "E+2".
132
15.1M
    int64_t exponent = 0;
133
15.1M
    auto end_digit_index = i;
134
15.1M
    if (i != len) {
135
13.7k
        bool negative_exponent = false;
136
13.7k
        if (s[i] == 'e' || s[i] == 'E') {
137
13.6k
            ++i;
138
13.6k
            if (i != len) {
139
13.6k
                switch (s[i]) {
140
5.89k
                case '-':
141
5.89k
                    negative_exponent = true;
142
5.89k
                    [[fallthrough]];
143
5.89k
                case '+':
144
5.89k
                    ++i;
145
13.6k
                }
146
13.6k
            }
147
13.6k
            if (i == len) {
148
                // '123e', '123e+', '123e-'
149
0
                *result = StringParser::PARSE_FAILURE;
150
0
                return 0;
151
0
            }
152
40.7k
            for (; i != len; ++i) {
153
27.0k
                const char& c = s[i];
154
27.0k
                if (LIKELY('0' <= c && c <= '9')) {
155
27.0k
                    exponent = exponent * 10 + (c - '0');
156
                    // max string len is config::string_type_length_soft_limit_bytes,
157
                    // whose max value is std::numeric_limits<int32_t>::max() - 4,
158
                    // just check overflow of int32_t to simplify the logic
159
                    // For edge cases like 0.{2147483647 zeros}e+2147483647
160
27.0k
                    if (exponent > std::numeric_limits<int32_t>::max()) {
161
0
                        *result = StringParser::PARSE_OVERFLOW;
162
0
                        return 0;
163
0
                    }
164
27.0k
                } else {
165
                    // '123e12abc', '123e1.2'
166
0
                    *result = StringParser::PARSE_FAILURE;
167
0
                    return 0;
168
0
                }
169
27.0k
            }
170
13.6k
            if (negative_exponent) {
171
5.89k
                exponent = -exponent;
172
5.89k
            }
173
13.6k
        } else {
174
81
            *result = StringParser::PARSE_FAILURE;
175
81
            return 0;
176
81
        }
177
13.7k
    }
178
    // TODO: check limit values of exponent and add UT
179
    // max string len is config::string_type_length_soft_limit_bytes,
180
    // whose max value is std::numeric_limits<int32_t>::max() - 4,
181
    // so int_part_count will be in range of int32_t,
182
    // and int_part_count + exponent will be in range of int64_t
183
15.1M
    int64_t tmp_result_int_part_digit_count = int_part_count + exponent;
184
15.1M
    if (tmp_result_int_part_digit_count > std::numeric_limits<int>::max() ||
185
15.1M
        tmp_result_int_part_digit_count < std::numeric_limits<int>::min()) {
186
0
        *result = is_negative ? StringParser::PARSE_UNDERFLOW : StringParser::PARSE_OVERFLOW;
187
0
        return 0;
188
0
    }
189
15.1M
    int result_int_part_digit_count = tmp_result_int_part_digit_count;
190
15.1M
    T int_part_number = 0;
191
15.1M
    T frac_part_number = 0;
192
15.1M
    int actual_frac_part_count = 0;
193
15.1M
    int digit_index = 0;
194
15.1M
    if (result_int_part_digit_count >= 0) {
195
        // `max_index` is the raw significand index where integer-part digits stop. Add one extra
196
        // raw character only when crossing an in-buffer dot, e.g. "1.4E+2" must scan "1.4" to
197
        // collect three integer digits after the exponent shift. It is capped by end_digit_index
198
        // because missing digits are appended later by multiplying with powers of 10.
199
15.1M
        int max_index = std::min(found_dot ? (result_int_part_digit_count +
200
11.4M
                                              ((int_part_count > 0 && exponent > 0) ? 1 : 0))
201
15.1M
                                           : result_int_part_digit_count,
202
15.1M
                                 end_digit_index);
203
15.1M
        max_index = (max_index == std::numeric_limits<int>::min() ? end_digit_index : max_index);
204
        // skip zero number
205
15.3M
        for (; digit_index != max_index && s[digit_index] == '0'; ++digit_index) {
206
214k
        }
207
        // test 0.00, .00, 0.{00...}e2147483647
208
        // 0.00000e2147483647
209
15.1M
        if (digit_index != max_index &&
210
15.1M
            (result_int_part_digit_count - digit_index > type_precision - type_scale)) {
211
10.5k
            *result = is_negative ? StringParser::PARSE_UNDERFLOW : StringParser::PARSE_OVERFLOW;
212
10.5k
            return 0;
213
10.5k
        }
214
        // get int part number
215
42.4M
        for (; digit_index != max_index; ++digit_index) {
216
27.3M
            if (UNLIKELY(s[digit_index] == '.')) {
217
960
                continue;
218
960
            }
219
27.3M
            int_part_number = int_part_number * 10 + (s[digit_index] - '0');
220
27.3M
        }
221
        // Count only significand digits, not exponent syntax. If the exponent moves the decimal
222
        // point past all available significant digits, append zeros by scaling the integer part:
223
        // "1.4E+2" scans integer 14, total_significant_digit_count=2, then multiplies by 10.
224
15.1M
        auto total_significant_digit_count =
225
15.1M
                end_digit_index - ((found_dot && int_part_count > 0) ? 1 : 0);
226
15.1M
        if (result_int_part_digit_count > total_significant_digit_count) {
227
92
            int_part_number *= get_scale_multiplier<T>(result_int_part_digit_count -
228
92
                                                       total_significant_digit_count);
229
92
        }
230
15.1M
    } else {
231
        // leading zeros of fraction part
232
5.53k
        actual_frac_part_count = -result_int_part_digit_count;
233
5.53k
    }
234
    // get fraction part number
235
42.7M
    for (; digit_index != end_digit_index && actual_frac_part_count < type_scale; ++digit_index) {
236
27.6M
        if (UNLIKELY(s[digit_index] == '.')) {
237
4.25M
            continue;
238
4.25M
        }
239
23.3M
        frac_part_number = frac_part_number * 10 + (s[digit_index] - '0');
240
23.3M
        ++actual_frac_part_count;
241
23.3M
    }
242
15.1M
    auto type_scale_multiplier = get_scale_multiplier<T>(type_scale);
243
    // Round only when the next parsed significand digit is exactly the first discarded fractional
244
    // digit. If `actual_frac_part_count` is already greater than type_scale, the missing positions
245
    // are implicit zeros from a negative exponent, so "5e-17" to scale 15 must stay 0 instead of
246
    // rounding up.
247
15.1M
    if (actual_frac_part_count == type_scale && digit_index != end_digit_index) {
248
22.4k
        if (UNLIKELY(s[digit_index] == '.')) {
249
869
            ++digit_index;
250
869
        }
251
22.4k
        if (digit_index != end_digit_index) {
252
            // example: test 1.5 -> decimal(1, 0)
253
22.2k
            if (s[digit_index] >= '5') {
254
8.99k
                ++frac_part_number;
255
8.99k
                if (frac_part_number == type_scale_multiplier) {
256
988
                    frac_part_number = 0;
257
988
                    ++int_part_number;
258
988
                }
259
8.99k
            }
260
22.2k
        }
261
15.0M
    } else {
262
15.0M
        if (actual_frac_part_count < type_scale) {
263
3.63M
            frac_part_number *= get_scale_multiplier<T>(type_scale - actual_frac_part_count);
264
3.63M
        }
265
15.0M
    }
266
15.1M
    if (int_part_number >= get_scale_multiplier<T>(type_precision - type_scale)) {
267
56
        *result = is_negative ? StringParser::PARSE_UNDERFLOW : StringParser::PARSE_OVERFLOW;
268
56
        return 0;
269
56
    }
270
271
15.1M
    T value = int_part_number * type_scale_multiplier + frac_part_number;
272
15.1M
    *result = StringParser::PARSE_SUCCESS;
273
15.1M
    return is_negative ? T(-value) : T(value);
274
15.1M
}
_ZN5doris12StringParser17string_to_decimalILNS_13PrimitiveTypeE30EEENS_19PrimitiveTypeTraitsIXT_EE7CppType10NativeTypeEPKcmiiPNS0_11ParseResultE
Line
Count
Source
61
2.12M
        ParseResult* result) {
62
2.12M
    using T = typename PrimitiveTypeTraits<P>::CppType::NativeType;
63
2.12M
    static_assert(std::is_same_v<T, int32_t> || std::is_same_v<T, int64_t> ||
64
2.12M
                          std::is_same_v<T, __int128> || std::is_same_v<T, wide::Int256>,
65
2.12M
                  "Cast string to decimal only support target type int32_t, int64_t, __int128 or "
66
2.12M
                  "wide::Int256.");
67
68
    // Parse in two logical coordinate systems:
69
    // 1. `s[0, end_digit_index)` is the normalized significand after trimming spaces, sign and
70
    //    leading zeros. If the original value starts with '.', the dot is also skipped so
71
    //    ".14E+3" is parsed as significand "14" with exponent 3.
72
    // 2. `result_int_part_digit_count = int_part_count + exponent` is the decimal point position
73
    //    after applying scientific notation. For example, "1.4E+2" has int_part_count=1,
74
    //    exponent=2, result_int_part_digit_count=3, so "14" becomes integer 140.
75
    // `digit_index` always indexes the normalized significand string, which may still contain a
76
    // dot for inputs like "1.4E+2"; loops that build numbers skip that dot explicitly.
77
    // Ignore leading and trailing spaces.
78
2.12M
    s = skip_ascii_whitespaces(s, len);
79
80
2.12M
    bool is_negative = false;
81
2.12M
    if (len > 0) {
82
2.03M
        switch (*s) {
83
79.9k
        case '-':
84
79.9k
            is_negative = true;
85
79.9k
            [[fallthrough]];
86
86.6k
        case '+':
87
86.6k
            ++s;
88
86.6k
            --len;
89
2.03M
        }
90
2.03M
    }
91
    // Ignore leading zeros.
92
2.12M
    bool found_value = false;
93
2.20M
    while (len > 0 && UNLIKELY(*s == '0')) {
94
80.5k
        found_value = true;
95
80.5k
        ++s;
96
80.5k
        --len;
97
80.5k
    }
98
99
2.12M
    int found_dot = 0;
100
2.12M
    if (len > 0 && *s == '.') {
101
29.1k
        found_dot = 1;
102
29.1k
        ++s;
103
29.1k
        --len;
104
29.1k
    }
105
2.12M
    int int_part_count = 0;
106
2.12M
    int i = 0;
107
36.7M
    for (; i != len; ++i) {
108
34.6M
        const char& c = s[i];
109
34.6M
        if (LIKELY('0' <= c && c <= '9')) {
110
32.8M
            found_value = true;
111
32.8M
            if (!found_dot) {
112
24.4M
                ++int_part_count;
113
24.4M
            }
114
32.8M
        } else if (c == '.') {
115
1.75M
            if (found_dot) {
116
0
                *result = StringParser::PARSE_FAILURE;
117
0
                return 0;
118
0
            }
119
1.75M
            found_dot = 1;
120
1.75M
        } else {
121
10.0k
            break;
122
10.0k
        }
123
34.6M
    }
124
2.12M
    if (!found_value) {
125
        // '', '.'
126
90.6k
        *result = StringParser::PARSE_FAILURE;
127
90.6k
        return 0;
128
90.6k
    }
129
    // Parse exponent if any. Keep `end_digit_index` before consuming 'e/E' so later digit counts
130
    // ignore exponent syntax. For "1.4E+2", end_digit_index points just after "1.4", not after
131
    // "E+2".
132
2.03M
    int64_t exponent = 0;
133
2.03M
    auto end_digit_index = i;
134
2.03M
    if (i != len) {
135
12.4k
        bool negative_exponent = false;
136
12.4k
        if (s[i] == 'e' || s[i] == 'E') {
137
12.3k
            ++i;
138
12.3k
            if (i != len) {
139
12.3k
                switch (s[i]) {
140
4.62k
                case '-':
141
4.62k
                    negative_exponent = true;
142
4.62k
                    [[fallthrough]];
143
4.62k
                case '+':
144
4.62k
                    ++i;
145
12.3k
                }
146
12.3k
            }
147
12.3k
            if (i == len) {
148
                // '123e', '123e+', '123e-'
149
0
                *result = StringParser::PARSE_FAILURE;
150
0
                return 0;
151
0
            }
152
35.7k
            for (; i != len; ++i) {
153
23.3k
                const char& c = s[i];
154
23.3k
                if (LIKELY('0' <= c && c <= '9')) {
155
23.3k
                    exponent = exponent * 10 + (c - '0');
156
                    // max string len is config::string_type_length_soft_limit_bytes,
157
                    // whose max value is std::numeric_limits<int32_t>::max() - 4,
158
                    // just check overflow of int32_t to simplify the logic
159
                    // For edge cases like 0.{2147483647 zeros}e+2147483647
160
23.3k
                    if (exponent > std::numeric_limits<int32_t>::max()) {
161
0
                        *result = StringParser::PARSE_OVERFLOW;
162
0
                        return 0;
163
0
                    }
164
23.3k
                } else {
165
                    // '123e12abc', '123e1.2'
166
0
                    *result = StringParser::PARSE_FAILURE;
167
0
                    return 0;
168
0
                }
169
23.3k
            }
170
12.3k
            if (negative_exponent) {
171
4.62k
                exponent = -exponent;
172
4.62k
            }
173
12.3k
        } else {
174
20
            *result = StringParser::PARSE_FAILURE;
175
20
            return 0;
176
20
        }
177
12.4k
    }
178
    // TODO: check limit values of exponent and add UT
179
    // max string len is config::string_type_length_soft_limit_bytes,
180
    // whose max value is std::numeric_limits<int32_t>::max() - 4,
181
    // so int_part_count will be in range of int32_t,
182
    // and int_part_count + exponent will be in range of int64_t
183
2.03M
    int64_t tmp_result_int_part_digit_count = int_part_count + exponent;
184
2.03M
    if (tmp_result_int_part_digit_count > std::numeric_limits<int>::max() ||
185
2.03M
        tmp_result_int_part_digit_count < std::numeric_limits<int>::min()) {
186
0
        *result = is_negative ? StringParser::PARSE_UNDERFLOW : StringParser::PARSE_OVERFLOW;
187
0
        return 0;
188
0
    }
189
2.03M
    int result_int_part_digit_count = tmp_result_int_part_digit_count;
190
2.03M
    T int_part_number = 0;
191
2.03M
    T frac_part_number = 0;
192
2.03M
    int actual_frac_part_count = 0;
193
2.03M
    int digit_index = 0;
194
2.03M
    if (result_int_part_digit_count >= 0) {
195
        // `max_index` is the raw significand index where integer-part digits stop. Add one extra
196
        // raw character only when crossing an in-buffer dot, e.g. "1.4E+2" must scan "1.4" to
197
        // collect three integer digits after the exponent shift. It is capped by end_digit_index
198
        // because missing digits are appended later by multiplying with powers of 10.
199
2.02M
        int max_index = std::min(found_dot ? (result_int_part_digit_count +
200
1.78M
                                              ((int_part_count > 0 && exponent > 0) ? 1 : 0))
201
2.02M
                                           : result_int_part_digit_count,
202
2.02M
                                 end_digit_index);
203
2.02M
        max_index = (max_index == std::numeric_limits<int>::min() ? end_digit_index : max_index);
204
        // skip zero number
205
2.23M
        for (; digit_index != max_index && s[digit_index] == '0'; ++digit_index) {
206
213k
        }
207
        // test 0.00, .00, 0.{00...}e2147483647
208
        // 0.00000e2147483647
209
2.02M
        if (digit_index != max_index &&
210
2.02M
            (result_int_part_digit_count - digit_index > type_precision - type_scale)) {
211
143
            *result = is_negative ? StringParser::PARSE_UNDERFLOW : StringParser::PARSE_OVERFLOW;
212
143
            return 0;
213
143
        }
214
        // get int part number
215
26.5M
        for (; digit_index != max_index; ++digit_index) {
216
24.4M
            if (UNLIKELY(s[digit_index] == '.')) {
217
962
                continue;
218
962
            }
219
24.4M
            int_part_number = int_part_number * 10 + (s[digit_index] - '0');
220
24.4M
        }
221
        // Count only significand digits, not exponent syntax. If the exponent moves the decimal
222
        // point past all available significant digits, append zeros by scaling the integer part:
223
        // "1.4E+2" scans integer 14, total_significant_digit_count=2, then multiplies by 10.
224
2.02M
        auto total_significant_digit_count =
225
2.02M
                end_digit_index - ((found_dot && int_part_count > 0) ? 1 : 0);
226
2.02M
        if (result_int_part_digit_count > total_significant_digit_count) {
227
80
            int_part_number *= get_scale_multiplier<T>(result_int_part_digit_count -
228
80
                                                       total_significant_digit_count);
229
80
        }
230
2.02M
    } else {
231
        // leading zeros of fraction part
232
15.3k
        actual_frac_part_count = -result_int_part_digit_count;
233
15.3k
    }
234
    // get fraction part number
235
11.9M
    for (; digit_index != end_digit_index && actual_frac_part_count < type_scale; ++digit_index) {
236
9.86M
        if (UNLIKELY(s[digit_index] == '.')) {
237
1.75M
            continue;
238
1.75M
        }
239
8.11M
        frac_part_number = frac_part_number * 10 + (s[digit_index] - '0');
240
8.11M
        ++actual_frac_part_count;
241
8.11M
    }
242
2.03M
    auto type_scale_multiplier = get_scale_multiplier<T>(type_scale);
243
    // Round only when the next parsed significand digit is exactly the first discarded fractional
244
    // digit. If `actual_frac_part_count` is already greater than type_scale, the missing positions
245
    // are implicit zeros from a negative exponent, so "5e-17" to scale 15 must stay 0 instead of
246
    // rounding up.
247
2.03M
    if (actual_frac_part_count == type_scale && digit_index != end_digit_index) {
248
20.1k
        if (UNLIKELY(s[digit_index] == '.')) {
249
852
            ++digit_index;
250
852
        }
251
20.1k
        if (digit_index != end_digit_index) {
252
            // example: test 1.5 -> decimal(1, 0)
253
19.9k
            if (s[digit_index] >= '5') {
254
8.03k
                ++frac_part_number;
255
8.03k
                if (frac_part_number == type_scale_multiplier) {
256
906
                    frac_part_number = 0;
257
906
                    ++int_part_number;
258
906
                }
259
8.03k
            }
260
19.9k
        }
261
2.01M
    } else {
262
2.01M
        if (actual_frac_part_count < type_scale) {
263
351k
            frac_part_number *= get_scale_multiplier<T>(type_scale - actual_frac_part_count);
264
351k
        }
265
2.01M
    }
266
2.03M
    if (int_part_number >= get_scale_multiplier<T>(type_precision - type_scale)) {
267
16
        *result = is_negative ? StringParser::PARSE_UNDERFLOW : StringParser::PARSE_OVERFLOW;
268
16
        return 0;
269
16
    }
270
271
2.03M
    T value = int_part_number * type_scale_multiplier + frac_part_number;
272
2.03M
    *result = StringParser::PARSE_SUCCESS;
273
2.03M
    return is_negative ? T(-value) : T(value);
274
2.03M
}
_ZN5doris12StringParser17string_to_decimalILNS_13PrimitiveTypeE20EEENS_19PrimitiveTypeTraitsIXT_EE7CppType10NativeTypeEPKcmiiPNS0_11ParseResultE
Line
Count
Source
61
14.2k
        ParseResult* result) {
62
14.2k
    using T = typename PrimitiveTypeTraits<P>::CppType::NativeType;
63
14.2k
    static_assert(std::is_same_v<T, int32_t> || std::is_same_v<T, int64_t> ||
64
14.2k
                          std::is_same_v<T, __int128> || std::is_same_v<T, wide::Int256>,
65
14.2k
                  "Cast string to decimal only support target type int32_t, int64_t, __int128 or "
66
14.2k
                  "wide::Int256.");
67
68
    // Parse in two logical coordinate systems:
69
    // 1. `s[0, end_digit_index)` is the normalized significand after trimming spaces, sign and
70
    //    leading zeros. If the original value starts with '.', the dot is also skipped so
71
    //    ".14E+3" is parsed as significand "14" with exponent 3.
72
    // 2. `result_int_part_digit_count = int_part_count + exponent` is the decimal point position
73
    //    after applying scientific notation. For example, "1.4E+2" has int_part_count=1,
74
    //    exponent=2, result_int_part_digit_count=3, so "14" becomes integer 140.
75
    // `digit_index` always indexes the normalized significand string, which may still contain a
76
    // dot for inputs like "1.4E+2"; loops that build numbers skip that dot explicitly.
77
    // Ignore leading and trailing spaces.
78
14.2k
    s = skip_ascii_whitespaces(s, len);
79
80
14.2k
    bool is_negative = false;
81
14.2k
    if (len > 0) {
82
14.2k
        switch (*s) {
83
6.77k
        case '-':
84
6.77k
            is_negative = true;
85
6.77k
            [[fallthrough]];
86
6.77k
        case '+':
87
6.77k
            ++s;
88
6.77k
            --len;
89
14.2k
        }
90
14.2k
    }
91
    // Ignore leading zeros.
92
14.2k
    bool found_value = false;
93
53.1k
    while (len > 0 && UNLIKELY(*s == '0')) {
94
38.9k
        found_value = true;
95
38.9k
        ++s;
96
38.9k
        --len;
97
38.9k
    }
98
99
14.2k
    int found_dot = 0;
100
14.2k
    if (len > 0 && *s == '.') {
101
2.08k
        found_dot = 1;
102
2.08k
        ++s;
103
2.08k
        --len;
104
2.08k
    }
105
14.2k
    int int_part_count = 0;
106
14.2k
    int i = 0;
107
288k
    for (; i != len; ++i) {
108
274k
        const char& c = s[i];
109
274k
        if (LIKELY('0' <= c && c <= '9')) {
110
262k
            found_value = true;
111
262k
            if (!found_dot) {
112
139k
                ++int_part_count;
113
139k
            }
114
262k
        } else if (c == '.') {
115
11.9k
            if (found_dot) {
116
0
                *result = StringParser::PARSE_FAILURE;
117
0
                return 0;
118
0
            }
119
11.9k
            found_dot = 1;
120
11.9k
        } else {
121
12
            break;
122
12
        }
123
274k
    }
124
14.2k
    if (!found_value) {
125
        // '', '.'
126
11
        *result = StringParser::PARSE_FAILURE;
127
11
        return 0;
128
11
    }
129
    // Parse exponent if any. Keep `end_digit_index` before consuming 'e/E' so later digit counts
130
    // ignore exponent syntax. For "1.4E+2", end_digit_index points just after "1.4", not after
131
    // "E+2".
132
14.2k
    int64_t exponent = 0;
133
14.2k
    auto end_digit_index = i;
134
14.2k
    if (i != len) {
135
1
        bool negative_exponent = false;
136
1
        if (s[i] == 'e' || s[i] == 'E') {
137
0
            ++i;
138
0
            if (i != len) {
139
0
                switch (s[i]) {
140
0
                case '-':
141
0
                    negative_exponent = true;
142
0
                    [[fallthrough]];
143
0
                case '+':
144
0
                    ++i;
145
0
                }
146
0
            }
147
0
            if (i == len) {
148
                // '123e', '123e+', '123e-'
149
0
                *result = StringParser::PARSE_FAILURE;
150
0
                return 0;
151
0
            }
152
0
            for (; i != len; ++i) {
153
0
                const char& c = s[i];
154
0
                if (LIKELY('0' <= c && c <= '9')) {
155
0
                    exponent = exponent * 10 + (c - '0');
156
                    // max string len is config::string_type_length_soft_limit_bytes,
157
                    // whose max value is std::numeric_limits<int32_t>::max() - 4,
158
                    // just check overflow of int32_t to simplify the logic
159
                    // For edge cases like 0.{2147483647 zeros}e+2147483647
160
0
                    if (exponent > std::numeric_limits<int32_t>::max()) {
161
0
                        *result = StringParser::PARSE_OVERFLOW;
162
0
                        return 0;
163
0
                    }
164
0
                } else {
165
                    // '123e12abc', '123e1.2'
166
0
                    *result = StringParser::PARSE_FAILURE;
167
0
                    return 0;
168
0
                }
169
0
            }
170
0
            if (negative_exponent) {
171
0
                exponent = -exponent;
172
0
            }
173
1
        } else {
174
1
            *result = StringParser::PARSE_FAILURE;
175
1
            return 0;
176
1
        }
177
1
    }
178
    // TODO: check limit values of exponent and add UT
179
    // max string len is config::string_type_length_soft_limit_bytes,
180
    // whose max value is std::numeric_limits<int32_t>::max() - 4,
181
    // so int_part_count will be in range of int32_t,
182
    // and int_part_count + exponent will be in range of int64_t
183
14.2k
    int64_t tmp_result_int_part_digit_count = int_part_count + exponent;
184
14.2k
    if (tmp_result_int_part_digit_count > std::numeric_limits<int>::max() ||
185
14.2k
        tmp_result_int_part_digit_count < std::numeric_limits<int>::min()) {
186
0
        *result = is_negative ? StringParser::PARSE_UNDERFLOW : StringParser::PARSE_OVERFLOW;
187
0
        return 0;
188
0
    }
189
14.2k
    int result_int_part_digit_count = tmp_result_int_part_digit_count;
190
14.2k
    T int_part_number = 0;
191
14.2k
    T frac_part_number = 0;
192
14.2k
    int actual_frac_part_count = 0;
193
14.2k
    int digit_index = 0;
194
14.2k
    if (result_int_part_digit_count >= 0) {
195
        // `max_index` is the raw significand index where integer-part digits stop. Add one extra
196
        // raw character only when crossing an in-buffer dot, e.g. "1.4E+2" must scan "1.4" to
197
        // collect three integer digits after the exponent shift. It is capped by end_digit_index
198
        // because missing digits are appended later by multiplying with powers of 10.
199
14.2k
        int max_index = std::min(found_dot ? (result_int_part_digit_count +
200
14.0k
                                              ((int_part_count > 0 && exponent > 0) ? 1 : 0))
201
14.2k
                                           : result_int_part_digit_count,
202
14.2k
                                 end_digit_index);
203
14.2k
        max_index = (max_index == std::numeric_limits<int>::min() ? end_digit_index : max_index);
204
        // skip zero number
205
14.2k
        for (; digit_index != max_index && s[digit_index] == '0'; ++digit_index) {
206
0
        }
207
        // test 0.00, .00, 0.{00...}e2147483647
208
        // 0.00000e2147483647
209
14.2k
        if (digit_index != max_index &&
210
14.2k
            (result_int_part_digit_count - digit_index > type_precision - type_scale)) {
211
8
            *result = is_negative ? StringParser::PARSE_UNDERFLOW : StringParser::PARSE_OVERFLOW;
212
8
            return 0;
213
8
        }
214
        // get int part number
215
153k
        for (; digit_index != max_index; ++digit_index) {
216
139k
            if (UNLIKELY(s[digit_index] == '.')) {
217
0
                continue;
218
0
            }
219
139k
            int_part_number = int_part_number * 10 + (s[digit_index] - '0');
220
139k
        }
221
        // Count only significand digits, not exponent syntax. If the exponent moves the decimal
222
        // point past all available significant digits, append zeros by scaling the integer part:
223
        // "1.4E+2" scans integer 14, total_significant_digit_count=2, then multiplies by 10.
224
14.2k
        auto total_significant_digit_count =
225
14.2k
                end_digit_index - ((found_dot && int_part_count > 0) ? 1 : 0);
226
14.2k
        if (result_int_part_digit_count > total_significant_digit_count) {
227
0
            int_part_number *= get_scale_multiplier<T>(result_int_part_digit_count -
228
0
                                                       total_significant_digit_count);
229
0
        }
230
14.2k
    } else {
231
        // leading zeros of fraction part
232
0
        actual_frac_part_count = -result_int_part_digit_count;
233
0
    }
234
    // get fraction part number
235
148k
    for (; digit_index != end_digit_index && actual_frac_part_count < type_scale; ++digit_index) {
236
134k
        if (UNLIKELY(s[digit_index] == '.')) {
237
11.9k
            continue;
238
11.9k
        }
239
122k
        frac_part_number = frac_part_number * 10 + (s[digit_index] - '0');
240
122k
        ++actual_frac_part_count;
241
122k
    }
242
14.2k
    auto type_scale_multiplier = get_scale_multiplier<T>(type_scale);
243
    // Round only when the next parsed significand digit is exactly the first discarded fractional
244
    // digit. If `actual_frac_part_count` is already greater than type_scale, the missing positions
245
    // are implicit zeros from a negative exponent, so "5e-17" to scale 15 must stay 0 instead of
246
    // rounding up.
247
14.2k
    if (actual_frac_part_count == type_scale && digit_index != end_digit_index) {
248
17
        if (UNLIKELY(s[digit_index] == '.')) {
249
0
            ++digit_index;
250
0
        }
251
17
        if (digit_index != end_digit_index) {
252
            // example: test 1.5 -> decimal(1, 0)
253
17
            if (s[digit_index] >= '5') {
254
17
                ++frac_part_number;
255
17
                if (frac_part_number == type_scale_multiplier) {
256
0
                    frac_part_number = 0;
257
0
                    ++int_part_number;
258
0
                }
259
17
            }
260
17
        }
261
14.1k
    } else {
262
14.1k
        if (actual_frac_part_count < type_scale) {
263
2.17k
            frac_part_number *= get_scale_multiplier<T>(type_scale - actual_frac_part_count);
264
2.17k
        }
265
14.1k
    }
266
14.2k
    if (int_part_number >= get_scale_multiplier<T>(type_precision - type_scale)) {
267
0
        *result = is_negative ? StringParser::PARSE_UNDERFLOW : StringParser::PARSE_OVERFLOW;
268
0
        return 0;
269
0
    }
270
271
14.2k
    T value = int_part_number * type_scale_multiplier + frac_part_number;
272
14.2k
    *result = StringParser::PARSE_SUCCESS;
273
14.2k
    return is_negative ? T(-value) : T(value);
274
14.2k
}
_ZN5doris12StringParser17string_to_decimalILNS_13PrimitiveTypeE35EEENS_19PrimitiveTypeTraitsIXT_EE7CppType10NativeTypeEPKcmiiPNS0_11ParseResultE
Line
Count
Source
61
135k
        ParseResult* result) {
62
135k
    using T = typename PrimitiveTypeTraits<P>::CppType::NativeType;
63
135k
    static_assert(std::is_same_v<T, int32_t> || std::is_same_v<T, int64_t> ||
64
135k
                          std::is_same_v<T, __int128> || std::is_same_v<T, wide::Int256>,
65
135k
                  "Cast string to decimal only support target type int32_t, int64_t, __int128 or "
66
135k
                  "wide::Int256.");
67
68
    // Parse in two logical coordinate systems:
69
    // 1. `s[0, end_digit_index)` is the normalized significand after trimming spaces, sign and
70
    //    leading zeros. If the original value starts with '.', the dot is also skipped so
71
    //    ".14E+3" is parsed as significand "14" with exponent 3.
72
    // 2. `result_int_part_digit_count = int_part_count + exponent` is the decimal point position
73
    //    after applying scientific notation. For example, "1.4E+2" has int_part_count=1,
74
    //    exponent=2, result_int_part_digit_count=3, so "14" becomes integer 140.
75
    // `digit_index` always indexes the normalized significand string, which may still contain a
76
    // dot for inputs like "1.4E+2"; loops that build numbers skip that dot explicitly.
77
    // Ignore leading and trailing spaces.
78
135k
    s = skip_ascii_whitespaces(s, len);
79
80
135k
    bool is_negative = false;
81
135k
    if (len > 0) {
82
135k
        switch (*s) {
83
20.5k
        case '-':
84
20.5k
            is_negative = true;
85
20.5k
            [[fallthrough]];
86
27.1k
        case '+':
87
27.1k
            ++s;
88
27.1k
            --len;
89
135k
        }
90
135k
    }
91
    // Ignore leading zeros.
92
135k
    bool found_value = false;
93
234k
    while (len > 0 && UNLIKELY(*s == '0')) {
94
98.9k
        found_value = true;
95
98.9k
        ++s;
96
98.9k
        --len;
97
98.9k
    }
98
99
135k
    int found_dot = 0;
100
135k
    if (len > 0 && *s == '.') {
101
16.9k
        found_dot = 1;
102
16.9k
        ++s;
103
16.9k
        --len;
104
16.9k
    }
105
135k
    int int_part_count = 0;
106
135k
    int i = 0;
107
5.00M
    for (; i != len; ++i) {
108
4.94M
        const char& c = s[i];
109
4.94M
        if (LIKELY('0' <= c && c <= '9')) {
110
4.76M
            found_value = true;
111
4.76M
            if (!found_dot) {
112
1.76M
                ++int_part_count;
113
1.76M
            }
114
4.76M
        } else if (c == '.') {
115
105k
            if (found_dot) {
116
0
                *result = StringParser::PARSE_FAILURE;
117
0
                return 0;
118
0
            }
119
105k
            found_dot = 1;
120
105k
        } else {
121
78.0k
            break;
122
78.0k
        }
123
4.94M
    }
124
135k
    if (!found_value) {
125
        // '', '.'
126
78
        *result = StringParser::PARSE_FAILURE;
127
78
        return 0;
128
78
    }
129
    // Parse exponent if any. Keep `end_digit_index` before consuming 'e/E' so later digit counts
130
    // ignore exponent syntax. For "1.4E+2", end_digit_index points just after "1.4", not after
131
    // "E+2".
132
135k
    int64_t exponent = 0;
133
135k
    auto end_digit_index = i;
134
135k
    if (i != len) {
135
78.0k
        bool negative_exponent = false;
136
78.0k
        if (s[i] == 'e' || s[i] == 'E') {
137
77.9k
            ++i;
138
77.9k
            if (i != len) {
139
77.9k
                switch (s[i]) {
140
3.58k
                case '-':
141
3.58k
                    negative_exponent = true;
142
3.58k
                    [[fallthrough]];
143
70.2k
                case '+':
144
70.2k
                    ++i;
145
77.9k
                }
146
77.9k
            }
147
77.9k
            if (i == len) {
148
                // '123e', '123e+', '123e-'
149
0
                *result = StringParser::PARSE_FAILURE;
150
0
                return 0;
151
0
            }
152
234k
            for (; i != len; ++i) {
153
156k
                const char& c = s[i];
154
156k
                if (LIKELY('0' <= c && c <= '9')) {
155
156k
                    exponent = exponent * 10 + (c - '0');
156
                    // max string len is config::string_type_length_soft_limit_bytes,
157
                    // whose max value is std::numeric_limits<int32_t>::max() - 4,
158
                    // just check overflow of int32_t to simplify the logic
159
                    // For edge cases like 0.{2147483647 zeros}e+2147483647
160
156k
                    if (exponent > std::numeric_limits<int32_t>::max()) {
161
0
                        *result = StringParser::PARSE_OVERFLOW;
162
0
                        return 0;
163
0
                    }
164
156k
                } else {
165
                    // '123e12abc', '123e1.2'
166
10
                    *result = StringParser::PARSE_FAILURE;
167
10
                    return 0;
168
10
                }
169
156k
            }
170
77.9k
            if (negative_exponent) {
171
3.58k
                exponent = -exponent;
172
3.58k
            }
173
77.9k
        } else {
174
14
            *result = StringParser::PARSE_FAILURE;
175
14
            return 0;
176
14
        }
177
78.0k
    }
178
    // TODO: check limit values of exponent and add UT
179
    // max string len is config::string_type_length_soft_limit_bytes,
180
    // whose max value is std::numeric_limits<int32_t>::max() - 4,
181
    // so int_part_count will be in range of int32_t,
182
    // and int_part_count + exponent will be in range of int64_t
183
135k
    int64_t tmp_result_int_part_digit_count = int_part_count + exponent;
184
135k
    if (tmp_result_int_part_digit_count > std::numeric_limits<int>::max() ||
185
135k
        tmp_result_int_part_digit_count < std::numeric_limits<int>::min()) {
186
0
        *result = is_negative ? StringParser::PARSE_UNDERFLOW : StringParser::PARSE_OVERFLOW;
187
0
        return 0;
188
0
    }
189
135k
    int result_int_part_digit_count = tmp_result_int_part_digit_count;
190
135k
    T int_part_number = 0;
191
135k
    T frac_part_number = 0;
192
135k
    int actual_frac_part_count = 0;
193
135k
    int digit_index = 0;
194
135k
    if (result_int_part_digit_count >= 0) {
195
        // `max_index` is the raw significand index where integer-part digits stop. Add one extra
196
        // raw character only when crossing an in-buffer dot, e.g. "1.4E+2" must scan "1.4" to
197
        // collect three integer digits after the exponent shift. It is capped by end_digit_index
198
        // because missing digits are appended later by multiplying with powers of 10.
199
135k
        int max_index = std::min(found_dot ? (result_int_part_digit_count +
200
122k
                                              ((int_part_count > 0 && exponent > 0) ? 1 : 0))
201
135k
                                           : result_int_part_digit_count,
202
135k
                                 end_digit_index);
203
135k
        max_index = (max_index == std::numeric_limits<int>::min() ? end_digit_index : max_index);
204
        // skip zero number
205
353k
        for (; digit_index != max_index && s[digit_index] == '0'; ++digit_index) {
206
217k
        }
207
        // test 0.00, .00, 0.{00...}e2147483647
208
        // 0.00000e2147483647
209
135k
        if (digit_index != max_index &&
210
135k
            (result_int_part_digit_count - digit_index > type_precision - type_scale)) {
211
392
            *result = is_negative ? StringParser::PARSE_UNDERFLOW : StringParser::PARSE_OVERFLOW;
212
392
            return 0;
213
392
        }
214
        // get int part number
215
2.73M
        for (; digit_index != max_index; ++digit_index) {
216
2.59M
            if (UNLIKELY(s[digit_index] == '.')) {
217
67.5k
                continue;
218
67.5k
            }
219
2.53M
            int_part_number = int_part_number * 10 + (s[digit_index] - '0');
220
2.53M
        }
221
        // Count only significand digits, not exponent syntax. If the exponent moves the decimal
222
        // point past all available significant digits, append zeros by scaling the integer part:
223
        // "1.4E+2" scans integer 14, total_significant_digit_count=2, then multiplies by 10.
224
134k
        auto total_significant_digit_count =
225
134k
                end_digit_index - ((found_dot && int_part_count > 0) ? 1 : 0);
226
134k
        if (result_int_part_digit_count > total_significant_digit_count) {
227
64.6k
            int_part_number *= get_scale_multiplier<T>(result_int_part_digit_count -
228
64.6k
                                                       total_significant_digit_count);
229
64.6k
        }
230
134k
    } else {
231
        // leading zeros of fraction part
232
172
        actual_frac_part_count = -result_int_part_digit_count;
233
172
    }
234
    // get fraction part number
235
1.92M
    for (; digit_index != end_digit_index && actual_frac_part_count < type_scale; ++digit_index) {
236
1.78M
        if (UNLIKELY(s[digit_index] == '.')) {
237
34.0k
            continue;
238
34.0k
        }
239
1.75M
        frac_part_number = frac_part_number * 10 + (s[digit_index] - '0');
240
1.75M
        ++actual_frac_part_count;
241
1.75M
    }
242
135k
    auto type_scale_multiplier = get_scale_multiplier<T>(type_scale);
243
    // Round only when the next parsed significand digit is exactly the first discarded fractional
244
    // digit. If `actual_frac_part_count` is already greater than type_scale, the missing positions
245
    // are implicit zeros from a negative exponent, so "5e-17" to scale 15 must stay 0 instead of
246
    // rounding up.
247
135k
    if (actual_frac_part_count == type_scale && digit_index != end_digit_index) {
248
21.4k
        if (UNLIKELY(s[digit_index] == '.')) {
249
862
            ++digit_index;
250
862
        }
251
21.4k
        if (digit_index != end_digit_index) {
252
            // example: test 1.5 -> decimal(1, 0)
253
21.2k
            if (s[digit_index] >= '5') {
254
8.99k
                ++frac_part_number;
255
8.99k
                if (frac_part_number == type_scale_multiplier) {
256
988
                    frac_part_number = 0;
257
988
                    ++int_part_number;
258
988
                }
259
8.99k
            }
260
21.2k
        }
261
113k
    } else {
262
113k
        if (actual_frac_part_count < type_scale) {
263
91.6k
            frac_part_number *= get_scale_multiplier<T>(type_scale - actual_frac_part_count);
264
91.6k
        }
265
113k
    }
266
135k
    if (int_part_number >= get_scale_multiplier<T>(type_precision - type_scale)) {
267
56
        *result = is_negative ? StringParser::PARSE_UNDERFLOW : StringParser::PARSE_OVERFLOW;
268
56
        return 0;
269
56
    }
270
271
134k
    T value = int_part_number * type_scale_multiplier + frac_part_number;
272
134k
    *result = StringParser::PARSE_SUCCESS;
273
134k
    return is_negative ? T(-value) : T(value);
274
135k
}
275
276
// Explicit instantiations of StringParser::string_to_decimal, one per decimal primitive type:
//   TYPE_DECIMAL32   -> Int32
//   TYPE_DECIMAL64   -> Int64
//   TYPE_DECIMAL128I -> Int128
//   TYPE_DECIMALV2   -> Int128 (same return type as TYPE_DECIMAL128I)
//   TYPE_DECIMAL256  -> wide::Int256
// All five share the same parameter list (s, len, type_precision, type_scale, result).
// NOTE(review): presumably required because the template body lives in this .cpp rather than
// in the header, so other translation units link against these instantiations -- confirm.
template Int32 StringParser::string_to_decimal<PrimitiveType::TYPE_DECIMAL32>(
277
        const char* __restrict s, size_t len, int type_precision, int type_scale,
278
        ParseResult* result);
279
template Int64 StringParser::string_to_decimal<PrimitiveType::TYPE_DECIMAL64>(
280
        const char* __restrict s, size_t len, int type_precision, int type_scale,
281
        ParseResult* result);
282
template Int128 StringParser::string_to_decimal<PrimitiveType::TYPE_DECIMAL128I>(
283
        const char* __restrict s, size_t len, int type_precision, int type_scale,
284
        ParseResult* result);
285
template Int128 StringParser::string_to_decimal<PrimitiveType::TYPE_DECIMALV2>(
286
        const char* __restrict s, size_t len, int type_precision, int type_scale,
287
        ParseResult* result);
288
template wide::Int256 StringParser::string_to_decimal<PrimitiveType::TYPE_DECIMAL256>(
289
        const char* __restrict s, size_t len, int type_precision, int type_scale,
290
        ParseResult* result);
291
} // end namespace doris
292
#include "common/compile_check_avoid_end.h"