Coverage Report

Created: 2026-07-02 14:47

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/util/string_parser.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "util/string_parser.hpp"
19
20
#include <limits>
21
22
#include "core/extended_types.h"
23
#include "core/types.h"
24
namespace doris {
25
#include "common/compile_check_avoid_begin.h"
26
// Supported decimal number format:
27
// <decimal> ::= <whitespace>* <value> <whitespace>*
28
//
29
// <whitespace> ::= " " | "\t" | "\n" | "\r" | "\f" | "\v"
30
//
31
// <value> ::= <sign>? <significand> <exponent>?
32
//
33
// <sign> ::= "+" | "-"
34
//
35
// <significand> ::= <digits> "." <digits> | <digits> | <digits> "." | "." <digits>
36
//
37
// <digits> ::= <digit>+
38
//
39
// <digit> ::= "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9"
40
//
41
// <exponent> ::= <e_marker> <sign>? <digits>
42
//
43
// <e_marker> ::= "e" | "E"
44
//
45
// Parsing algorithm:
46
// 1. Trim spaces and the sign, then normalize the significand by skipping leading zeros and an
47
//    optional leading dot. During this scan, count digits that belong to the original integral
48
//    part (`int_part_count`) and remember where the significand ends (`end_digit_index`).
49
// 2. Parse the optional exponent. Scientific notation is handled by moving the decimal point:
50
//    `result_int_part_digit_count = int_part_count + exponent`. For example, "12.34e-1" has
51
//    int_part_count=2 and exponent=-1, so the result has one integral digit: "1.234".
52
// 3. Build the result in scaled-integer form: first collect the integral digits up to the shifted
53
//    decimal point, then collect up to `type_scale` fractional digits, padding with zeros when the
54
//    input has fewer fractional digits than the target scale.
55
// 4. If there are extra fractional digits, round half up using the first discarded digit. Finally,
56
//    check the integral digit count against `type_precision - type_scale` and return the signed
57
//    scaled integer value.
58
template <PrimitiveType P>
59
typename PrimitiveTypeTraits<P>::CppType::NativeType StringParser::string_to_decimal(
60
        const char* __restrict s, size_t len, int type_precision, int type_scale,
61
708k
        ParseResult* result) {
62
708k
    using T = typename PrimitiveTypeTraits<P>::CppType::NativeType;
63
708k
    static_assert(std::is_same_v<T, int32_t> || std::is_same_v<T, int64_t> ||
64
708k
                          std::is_same_v<T, __int128> || std::is_same_v<T, wide::Int256>,
65
708k
                  "Cast string to decimal only support target type int32_t, int64_t, __int128 or "
66
708k
                  "wide::Int256.");
67
68
    // Parse in two logical coordinate systems:
69
    // 1. `s[0, end_digit_index)` is the normalized significand after trimming spaces, sign and
70
    //    leading zeros. If the original value starts with '.', the dot is also skipped so
71
    //    ".14E+3" is parsed as significand "14" with exponent 3.
72
    // 2. `result_int_part_digit_count = int_part_count + exponent` is the decimal point position
73
    //    after applying scientific notation. For example, "1.4E+2" has int_part_count=1,
74
    //    exponent=2, result_int_part_digit_count=3, so "14" becomes integer 140.
75
    // `digit_index` always indexes the normalized significand string, which may still contain a
76
    // dot for inputs like "1.4E+2"; loops that build numbers skip that dot explicitly.
77
    // Ignore leading and trailing spaces.
78
708k
    s = skip_ascii_whitespaces(s, len);
79
80
708k
    bool is_negative = false;
81
708k
    if (len > 0) {
82
707k
        switch (*s) {
83
185k
        case '-':
84
185k
            is_negative = true;
85
185k
            [[fallthrough]];
86
239k
        case '+':
87
239k
            ++s;
88
239k
            --len;
89
707k
        }
90
707k
    }
91
    // Ignore leading zeros.
92
708k
    bool found_value = false;
93
1.38M
    while (len > 0 && UNLIKELY(*s == '0')) {
94
674k
        found_value = true;
95
674k
        ++s;
96
674k
        --len;
97
674k
    }
98
99
708k
    int found_dot = 0;
100
708k
    if (len > 0 && *s == '.') {
101
169k
        found_dot = 1;
102
169k
        ++s;
103
169k
        --len;
104
169k
    }
105
708k
    int int_part_count = 0;
106
708k
    int i = 0;
107
16.4M
    for (; i != len; ++i) {
108
15.9M
        const char& c = s[i];
109
15.9M
        if (LIKELY('0' <= c && c <= '9')) {
110
15.3M
            found_value = true;
111
15.3M
            if (!found_dot) {
112
4.73M
                ++int_part_count;
113
4.73M
            }
114
15.3M
        } else if (c == '.') {
115
448k
            if (found_dot) {
116
4
                *result = StringParser::PARSE_FAILURE;
117
4
                return 0;
118
4
            }
119
448k
            found_dot = 1;
120
448k
        } else {
121
221k
            break;
122
221k
        }
123
15.9M
    }
124
708k
    if (!found_value) {
125
        // '', '.'
126
692
        *result = StringParser::PARSE_FAILURE;
127
692
        return 0;
128
692
    }
129
    // Parse exponent if any. Keep `end_digit_index` before consuming 'e/E' so later digit counts
130
    // ignore exponent syntax. For "1.4E+2", end_digit_index points just after "1.4", not after
131
    // "E+2".
132
707k
    int64_t exponent = 0;
133
707k
    auto end_digit_index = i;
134
707k
    if (i != len) {
135
220k
        bool negative_exponent = false;
136
220k
        if (s[i] == 'e' || s[i] == 'E') {
137
220k
            ++i;
138
220k
            if (i != len) {
139
220k
                switch (s[i]) {
140
23.2k
                case '-':
141
23.2k
                    negative_exponent = true;
142
23.2k
                    [[fallthrough]];
143
158k
                case '+':
144
158k
                    ++i;
145
220k
                }
146
220k
            }
147
220k
            if (i == len) {
148
                // '123e', '123e+', '123e-'
149
12
                *result = StringParser::PARSE_FAILURE;
150
12
                return 0;
151
12
            }
152
644k
            for (; i != len; ++i) {
153
423k
                const char& c = s[i];
154
423k
                if (LIKELY('0' <= c && c <= '9')) {
155
423k
                    exponent = exponent * 10 + (c - '0');
156
                    // max string len is config::string_type_length_soft_limit_bytes,
157
                    // whose max value is std::numeric_limits<int32_t>::max() - 4,
158
                    // just check overflow of int32_t to simplify the logic
159
                    // For edge cases like 0.{2147483647 zeros}e+2147483647
160
423k
                    if (exponent > std::numeric_limits<int32_t>::max()) {
161
0
                        *result = StringParser::PARSE_OVERFLOW;
162
0
                        return 0;
163
0
                    }
164
423k
                } else {
165
                    // '123e12abc', '123e1.2'
166
44
                    *result = StringParser::PARSE_FAILURE;
167
44
                    return 0;
168
44
                }
169
423k
            }
170
220k
            if (negative_exponent) {
171
23.2k
                exponent = -exponent;
172
23.2k
            }
173
220k
        } else {
174
220
            *result = StringParser::PARSE_FAILURE;
175
220
            return 0;
176
220
        }
177
220k
    }
178
    // TODO: check limit values of exponent and add UT
179
    // max string len is config::string_type_length_soft_limit_bytes,
180
    // whose max value is std::numeric_limits<int32_t>::max() - 4,
181
    // so int_part_count will be in range of int32_t,
182
    // and int_part_count + exponent will be in range of int64_t
183
707k
    int64_t tmp_result_int_part_digit_count = int_part_count + exponent;
184
707k
    if (tmp_result_int_part_digit_count > std::numeric_limits<int>::max() ||
185
707k
        tmp_result_int_part_digit_count < std::numeric_limits<int>::min()) {
186
0
        *result = is_negative ? StringParser::PARSE_UNDERFLOW : StringParser::PARSE_OVERFLOW;
187
0
        return 0;
188
0
    }
189
707k
    int result_int_part_digit_count = tmp_result_int_part_digit_count;
190
707k
    T int_part_number = 0;
191
707k
    T frac_part_number = 0;
192
707k
    int actual_frac_part_count = 0;
193
707k
    int digit_index = 0;
194
707k
    if (result_int_part_digit_count >= 0) {
195
        // `max_index` is the raw significand index where integer-part digits stop. Add one extra
196
        // raw character only when crossing an in-buffer dot, e.g. "1.4E+2" must scan "1.4" to
197
        // collect three integer digits after the exponent shift. It is capped by end_digit_index
198
        // because missing digits are appended later by multiplying with powers of 10.
199
695k
        int max_index = std::min(found_dot ? (result_int_part_digit_count +
200
612k
                                              ((int_part_count > 0 && exponent > 0) ? 1 : 0))
201
695k
                                           : result_int_part_digit_count,
202
695k
                                 end_digit_index);
203
695k
        max_index = (max_index == std::numeric_limits<int>::min() ? end_digit_index : max_index);
204
        // skip zero number
205
2.40M
        for (; digit_index != max_index && s[digit_index] == '0'; ++digit_index) {
206
1.71M
        }
207
        // test 0.00, .00, 0.{00...}e2147483647
208
        // 0.00000e2147483647
209
695k
        if (digit_index != max_index &&
210
695k
            (result_int_part_digit_count - digit_index > type_precision - type_scale)) {
211
23.7k
            *result = is_negative ? StringParser::PARSE_UNDERFLOW : StringParser::PARSE_OVERFLOW;
212
23.7k
            return 0;
213
23.7k
        }
214
        // get int part number
215
6.99M
        for (; digit_index != max_index; ++digit_index) {
216
6.31M
            if (UNLIKELY(s[digit_index] == '.')) {
217
144k
                continue;
218
144k
            }
219
6.17M
            int_part_number = int_part_number * 10 + (s[digit_index] - '0');
220
6.17M
        }
221
        // Count only significand digits, not exponent syntax. If the exponent moves the decimal
222
        // point past all available significant digits, append zeros by scaling the integer part:
223
        // "1.4E+2" scans integer 14, total_significant_digit_count=2, then multiplies by 10.
224
671k
        auto total_significant_digit_count =
225
671k
                end_digit_index - ((found_dot && int_part_count > 0) ? 1 : 0);
226
671k
        if (result_int_part_digit_count > total_significant_digit_count) {
227
131k
            int_part_number *= get_scale_multiplier<T>(result_int_part_digit_count -
228
131k
                                                       total_significant_digit_count);
229
131k
        }
230
671k
    } else {
231
        // leading zeros of fraction part
232
11.3k
        actual_frac_part_count = -result_int_part_digit_count;
233
11.3k
    }
234
    // get fraction part number
235
7.27M
    for (; digit_index != end_digit_index && actual_frac_part_count < type_scale; ++digit_index) {
236
6.58M
        if (UNLIKELY(s[digit_index] == '.')) {
237
265k
            continue;
238
265k
        }
239
6.32M
        frac_part_number = frac_part_number * 10 + (s[digit_index] - '0');
240
6.32M
        ++actual_frac_part_count;
241
6.32M
    }
242
683k
    auto type_scale_multiplier = get_scale_multiplier<T>(type_scale);
243
    // Round only when the next parsed significand digit is exactly the first discarded fractional
244
    // digit. If `actual_frac_part_count` is already greater than type_scale, the missing positions
245
    // are implicit zeros from a negative exponent, so "5e-17" to scale 15 must stay 0 instead of
246
    // rounding up.
247
683k
    if (actual_frac_part_count == type_scale && digit_index != end_digit_index) {
248
161k
        if (UNLIKELY(s[digit_index] == '.')) {
249
6.92k
            ++digit_index;
250
6.92k
        }
251
161k
        if (digit_index != end_digit_index) {
252
            // example: test 1.5 -> decimal(1, 0)
253
159k
            if (s[digit_index] >= '5') {
254
65.8k
                ++frac_part_number;
255
65.8k
                if (frac_part_number == type_scale_multiplier) {
256
6.87k
                    frac_part_number = 0;
257
6.87k
                    ++int_part_number;
258
6.87k
                }
259
65.8k
            }
260
159k
        }
261
521k
    } else {
262
521k
        if (actual_frac_part_count < type_scale) {
263
389k
            frac_part_number *= get_scale_multiplier<T>(type_scale - actual_frac_part_count);
264
389k
        }
265
521k
    }
266
683k
    if (int_part_number >= get_scale_multiplier<T>(type_precision - type_scale)) {
267
144
        *result = is_negative ? StringParser::PARSE_UNDERFLOW : StringParser::PARSE_OVERFLOW;
268
144
        return 0;
269
144
    }
270
271
683k
    T value = int_part_number * type_scale_multiplier + frac_part_number;
272
683k
    *result = StringParser::PARSE_SUCCESS;
273
683k
    return is_negative ? T(-value) : T(value);
274
683k
}
_ZN5doris12StringParser17string_to_decimalILNS_13PrimitiveTypeE28EEENS_19PrimitiveTypeTraitsIXT_EE7CppType10NativeTypeEPKcmiiPNS0_11ParseResultE
Line
Count
Source
61
108k
        ParseResult* result) {
62
108k
    using T = typename PrimitiveTypeTraits<P>::CppType::NativeType;
63
108k
    static_assert(std::is_same_v<T, int32_t> || std::is_same_v<T, int64_t> ||
64
108k
                          std::is_same_v<T, __int128> || std::is_same_v<T, wide::Int256>,
65
108k
                  "Cast string to decimal only support target type int32_t, int64_t, __int128 or "
66
108k
                  "wide::Int256.");
67
68
    // Parse in two logical coordinate systems:
69
    // 1. `s[0, end_digit_index)` is the normalized significand after trimming spaces, sign and
70
    //    leading zeros. If the original value starts with '.', the dot is also skipped so
71
    //    ".14E+3" is parsed as significand "14" with exponent 3.
72
    // 2. `result_int_part_digit_count = int_part_count + exponent` is the decimal point position
73
    //    after applying scientific notation. For example, "1.4E+2" has int_part_count=1,
74
    //    exponent=2, result_int_part_digit_count=3, so "14" becomes integer 140.
75
    // `digit_index` always indexes the normalized significand string, which may still contain a
76
    // dot for inputs like "1.4E+2"; loops that build numbers skip that dot explicitly.
77
    // Ignore leading and trailing spaces.
78
108k
    s = skip_ascii_whitespaces(s, len);
79
80
108k
    bool is_negative = false;
81
108k
    if (len > 0) {
82
108k
        switch (*s) {
83
50.0k
        case '-':
84
50.0k
            is_negative = true;
85
50.0k
            [[fallthrough]];
86
64.7k
        case '+':
87
64.7k
            ++s;
88
64.7k
            --len;
89
108k
        }
90
108k
    }
91
    // Ignore leading zeros.
92
108k
    bool found_value = false;
93
216k
    while (len > 0 && UNLIKELY(*s == '0')) {
94
107k
        found_value = true;
95
107k
        ++s;
96
107k
        --len;
97
107k
    }
98
99
108k
    int found_dot = 0;
100
108k
    if (len > 0 && *s == '.') {
101
35.8k
        found_dot = 1;
102
35.8k
        ++s;
103
35.8k
        --len;
104
35.8k
    }
105
108k
    int int_part_count = 0;
106
108k
    int i = 0;
107
1.15M
    for (; i != len; ++i) {
108
1.06M
        const char& c = s[i];
109
1.06M
        if (LIKELY('0' <= c && c <= '9')) {
110
1.01M
            found_value = true;
111
1.01M
            if (!found_dot) {
112
318k
                ++int_part_count;
113
318k
            }
114
1.01M
        } else if (c == '.') {
115
29.3k
            if (found_dot) {
116
4
                *result = StringParser::PARSE_FAILURE;
117
4
                return 0;
118
4
            }
119
29.3k
            found_dot = 1;
120
29.3k
        } else {
121
19.0k
            break;
122
19.0k
        }
123
1.06M
    }
124
108k
    if (!found_value) {
125
        // '', '.'
126
292
        *result = StringParser::PARSE_FAILURE;
127
292
        return 0;
128
292
    }
129
    // Parse exponent if any. Keep `end_digit_index` before consuming 'e/E' so later digit counts
130
    // ignore exponent syntax. For "1.4E+2", end_digit_index points just after "1.4", not after
131
    // "E+2".
132
108k
    int64_t exponent = 0;
133
108k
    auto end_digit_index = i;
134
108k
    if (i != len) {
135
18.7k
        bool negative_exponent = false;
136
18.7k
        if (s[i] == 'e' || s[i] == 'E') {
137
18.6k
            ++i;
138
18.6k
            if (i != len) {
139
18.6k
                switch (s[i]) {
140
3.08k
                case '-':
141
3.08k
                    negative_exponent = true;
142
3.08k
                    [[fallthrough]];
143
3.08k
                case '+':
144
3.08k
                    ++i;
145
18.6k
                }
146
18.6k
            }
147
18.6k
            if (i == len) {
148
                // '123e', '123e+', '123e-'
149
12
                *result = StringParser::PARSE_FAILURE;
150
12
                return 0;
151
12
            }
152
49.3k
            for (; i != len; ++i) {
153
30.7k
                const char& c = s[i];
154
30.7k
                if (LIKELY('0' <= c && c <= '9')) {
155
30.6k
                    exponent = exponent * 10 + (c - '0');
156
                    // max string len is config::string_type_length_soft_limit_bytes,
157
                    // whose max value is std::numeric_limits<int32_t>::max() - 4,
158
                    // just check overflow of int32_t to simplify the logic
159
                    // For edge cases like 0.{2147483647 zeros}e+2147483647
160
30.6k
                    if (exponent > std::numeric_limits<int32_t>::max()) {
161
0
                        *result = StringParser::PARSE_OVERFLOW;
162
0
                        return 0;
163
0
                    }
164
30.6k
                } else {
165
                    // '123e12abc', '123e1.2'
166
24
                    *result = StringParser::PARSE_FAILURE;
167
24
                    return 0;
168
24
                }
169
30.7k
            }
170
18.6k
            if (negative_exponent) {
171
3.07k
                exponent = -exponent;
172
3.07k
            }
173
18.6k
        } else {
174
120
            *result = StringParser::PARSE_FAILURE;
175
120
            return 0;
176
120
        }
177
18.7k
    }
178
    // TODO: check limit values of exponent and add UT
179
    // max string len is config::string_type_length_soft_limit_bytes,
180
    // whose max value is std::numeric_limits<int32_t>::max() - 4,
181
    // so int_part_count will be in range of int32_t,
182
    // and int_part_count + exponent will be in range of int64_t
183
108k
    int64_t tmp_result_int_part_digit_count = int_part_count + exponent;
184
108k
    if (tmp_result_int_part_digit_count > std::numeric_limits<int>::max() ||
185
108k
        tmp_result_int_part_digit_count < std::numeric_limits<int>::min()) {
186
0
        *result = is_negative ? StringParser::PARSE_UNDERFLOW : StringParser::PARSE_OVERFLOW;
187
0
        return 0;
188
0
    }
189
108k
    int result_int_part_digit_count = tmp_result_int_part_digit_count;
190
108k
    T int_part_number = 0;
191
108k
    T frac_part_number = 0;
192
108k
    int actual_frac_part_count = 0;
193
108k
    int digit_index = 0;
194
108k
    if (result_int_part_digit_count >= 0) {
195
        // `max_index` is the raw significand index where integer-part digits stop. Add one extra
196
        // raw character only when crossing an in-buffer dot, e.g. "1.4E+2" must scan "1.4" to
197
        // collect three integer digits after the exponent shift. It is capped by end_digit_index
198
        // because missing digits are appended later by multiplying with powers of 10.
199
108k
        int max_index = std::min(found_dot ? (result_int_part_digit_count +
200
64.9k
                                              ((int_part_count > 0 && exponent > 0) ? 1 : 0))
201
108k
                                           : result_int_part_digit_count,
202
108k
                                 end_digit_index);
203
108k
        max_index = (max_index == std::numeric_limits<int>::min() ? end_digit_index : max_index);
204
        // skip zero number
205
533k
        for (; digit_index != max_index && s[digit_index] == '0'; ++digit_index) {
206
425k
        }
207
        // test 0.00, .00, 0.{00...}e2147483647
208
        // 0.00000e2147483647
209
108k
        if (digit_index != max_index &&
210
108k
            (result_int_part_digit_count - digit_index > type_precision - type_scale)) {
211
2.64k
            *result = is_negative ? StringParser::PARSE_UNDERFLOW : StringParser::PARSE_OVERFLOW;
212
2.64k
            return 0;
213
2.64k
        }
214
        // get int part number
215
310k
        for (; digit_index != max_index; ++digit_index) {
216
204k
            if (UNLIKELY(s[digit_index] == '.')) {
217
3.20k
                continue;
218
3.20k
            }
219
201k
            int_part_number = int_part_number * 10 + (s[digit_index] - '0');
220
201k
        }
221
        // Count only significand digits, not exponent syntax. If the exponent moves the decimal
222
        // point past all available significant digits, append zeros by scaling the integer part:
223
        // "1.4E+2" scans integer 14, total_significant_digit_count=2, then multiplies by 10.
224
105k
        auto total_significant_digit_count =
225
105k
                end_digit_index - ((found_dot && int_part_count > 0) ? 1 : 0);
226
105k
        if (result_int_part_digit_count > total_significant_digit_count) {
227
200
            int_part_number *= get_scale_multiplier<T>(result_int_part_digit_count -
228
200
                                                       total_significant_digit_count);
229
200
        }
230
105k
    } else {
231
        // leading zeros of fraction part
232
96
        actual_frac_part_count = -result_int_part_digit_count;
233
96
    }
234
    // get fraction part number
235
338k
    for (; digit_index != end_digit_index && actual_frac_part_count < type_scale; ++digit_index) {
236
232k
        if (UNLIKELY(s[digit_index] == '.')) {
237
19.9k
            continue;
238
19.9k
        }
239
212k
        frac_part_number = frac_part_number * 10 + (s[digit_index] - '0');
240
212k
        ++actual_frac_part_count;
241
212k
    }
242
105k
    auto type_scale_multiplier = get_scale_multiplier<T>(type_scale);
243
    // Round only when the next parsed significand digit is exactly the first discarded fractional
244
    // digit. If `actual_frac_part_count` is already greater than type_scale, the missing positions
245
    // are implicit zeros from a negative exponent, so "5e-17" to scale 15 must stay 0 instead of
246
    // rounding up.
247
105k
    if (actual_frac_part_count == type_scale && digit_index != end_digit_index) {
248
42.8k
        if (UNLIKELY(s[digit_index] == '.')) {
249
1.80k
            ++digit_index;
250
1.80k
        }
251
42.8k
        if (digit_index != end_digit_index) {
252
            // example: test 1.5 -> decimal(1, 0)
253
42.2k
            if (s[digit_index] >= '5') {
254
17.9k
                ++frac_part_number;
255
17.9k
                if (frac_part_number == type_scale_multiplier) {
256
1.71k
                    frac_part_number = 0;
257
1.71k
                    ++int_part_number;
258
1.71k
                }
259
17.9k
            }
260
42.2k
        }
261
63.0k
    } else {
262
63.0k
        if (actual_frac_part_count < type_scale) {
263
56.0k
            frac_part_number *= get_scale_multiplier<T>(type_scale - actual_frac_part_count);
264
56.0k
        }
265
63.0k
    }
266
105k
    if (int_part_number >= get_scale_multiplier<T>(type_precision - type_scale)) {
267
48
        *result = is_negative ? StringParser::PARSE_UNDERFLOW : StringParser::PARSE_OVERFLOW;
268
48
        return 0;
269
48
    }
270
271
105k
    T value = int_part_number * type_scale_multiplier + frac_part_number;
272
105k
    *result = StringParser::PARSE_SUCCESS;
273
105k
    return is_negative ? T(-value) : T(value);
274
105k
}
_ZN5doris12StringParser17string_to_decimalILNS_13PrimitiveTypeE29EEENS_19PrimitiveTypeTraitsIXT_EE7CppType10NativeTypeEPKcmiiPNS0_11ParseResultE
Line
Count
Source
61
172k
        ParseResult* result) {
62
172k
    using T = typename PrimitiveTypeTraits<P>::CppType::NativeType;
63
172k
    static_assert(std::is_same_v<T, int32_t> || std::is_same_v<T, int64_t> ||
64
172k
                          std::is_same_v<T, __int128> || std::is_same_v<T, wide::Int256>,
65
172k
                  "Cast string to decimal only support target type int32_t, int64_t, __int128 or "
66
172k
                  "wide::Int256.");
67
68
    // Parse in two logical coordinate systems:
69
    // 1. `s[0, end_digit_index)` is the normalized significand after trimming spaces, sign and
70
    //    leading zeros. If the original value starts with '.', the dot is also skipped so
71
    //    ".14E+3" is parsed as significand "14" with exponent 3.
72
    // 2. `result_int_part_digit_count = int_part_count + exponent` is the decimal point position
73
    //    after applying scientific notation. For example, "1.4E+2" has int_part_count=1,
74
    //    exponent=2, result_int_part_digit_count=3, so "14" becomes integer 140.
75
    // `digit_index` always indexes the normalized significand string, which may still contain a
76
    // dot for inputs like "1.4E+2"; loops that build numbers skip that dot explicitly.
77
    // Ignore leading and trailing spaces.
78
172k
    s = skip_ascii_whitespaces(s, len);
79
80
172k
    bool is_negative = false;
81
172k
    if (len > 0) {
82
172k
        switch (*s) {
83
43.4k
        case '-':
84
43.4k
            is_negative = true;
85
43.4k
            [[fallthrough]];
86
56.7k
        case '+':
87
56.7k
            ++s;
88
56.7k
            --len;
89
172k
        }
90
172k
    }
91
    // Ignore leading zeros.
92
172k
    bool found_value = false;
93
324k
    while (len > 0 && UNLIKELY(*s == '0')) {
94
152k
        found_value = true;
95
152k
        ++s;
96
152k
        --len;
97
152k
    }
98
99
172k
    int found_dot = 0;
100
172k
    if (len > 0 && *s == '.') {
101
47.9k
        found_dot = 1;
102
47.9k
        ++s;
103
47.9k
        --len;
104
47.9k
    }
105
172k
    int int_part_count = 0;
106
172k
    int i = 0;
107
2.81M
    for (; i != len; ++i) {
108
2.66M
        const char& c = s[i];
109
2.66M
        if (LIKELY('0' <= c && c <= '9')) {
110
2.53M
            found_value = true;
111
2.53M
            if (!found_dot) {
112
1.04M
                ++int_part_count;
113
1.04M
            }
114
2.53M
        } else if (c == '.') {
115
107k
            if (found_dot) {
116
0
                *result = StringParser::PARSE_FAILURE;
117
0
                return 0;
118
0
            }
119
107k
            found_dot = 1;
120
107k
        } else {
121
23.4k
            break;
122
23.4k
        }
123
2.66M
    }
124
172k
    if (!found_value) {
125
        // '', '.'
126
138
        *result = StringParser::PARSE_FAILURE;
127
138
        return 0;
128
138
    }
129
    // Parse exponent if any. Keep `end_digit_index` before consuming 'e/E' so later digit counts
130
    // ignore exponent syntax. For "1.4E+2", end_digit_index points just after "1.4", not after
131
    // "E+2".
132
172k
    int64_t exponent = 0;
133
172k
    auto end_digit_index = i;
134
172k
    if (i != len) {
135
23.3k
        bool negative_exponent = false;
136
23.3k
        if (s[i] == 'e' || s[i] == 'E') {
137
23.2k
            ++i;
138
23.2k
            if (i != len) {
139
23.2k
                switch (s[i]) {
140
7.75k
                case '-':
141
7.75k
                    negative_exponent = true;
142
7.75k
                    [[fallthrough]];
143
7.75k
                case '+':
144
7.75k
                    ++i;
145
23.2k
                }
146
23.2k
            }
147
23.2k
            if (i == len) {
148
                // '123e', '123e+', '123e-'
149
0
                *result = StringParser::PARSE_FAILURE;
150
0
                return 0;
151
0
            }
152
64.8k
            for (; i != len; ++i) {
153
41.6k
                const char& c = s[i];
154
41.6k
                if (LIKELY('0' <= c && c <= '9')) {
155
41.6k
                    exponent = exponent * 10 + (c - '0');
156
                    // max string len is config::string_type_length_soft_limit_bytes,
157
                    // whose max value is std::numeric_limits<int32_t>::max() - 4,
158
                    // just check overflow of int32_t to simplify the logic
159
                    // For edge cases like 0.{2147483647 zeros}e+2147483647
160
41.6k
                    if (exponent > std::numeric_limits<int32_t>::max()) {
161
0
                        *result = StringParser::PARSE_OVERFLOW;
162
0
                        return 0;
163
0
                    }
164
41.6k
                } else {
165
                    // '123e12abc', '123e1.2'
166
0
                    *result = StringParser::PARSE_FAILURE;
167
0
                    return 0;
168
0
                }
169
41.6k
            }
170
23.2k
            if (negative_exponent) {
171
7.75k
                exponent = -exponent;
172
7.75k
            }
173
23.2k
        } else {
174
46
            *result = StringParser::PARSE_FAILURE;
175
46
            return 0;
176
46
        }
177
23.3k
    }
178
    // TODO: check limit values of exponent and add UT
179
    // max string len is config::string_type_length_soft_limit_bytes,
180
    // whose max value is std::numeric_limits<int32_t>::max() - 4,
181
    // so int_part_count will be in range of int32_t,
182
    // and int_part_count + exponent will be in range of int64_t
183
172k
    int64_t tmp_result_int_part_digit_count = int_part_count + exponent;
184
172k
    if (tmp_result_int_part_digit_count > std::numeric_limits<int>::max() ||
185
172k
        tmp_result_int_part_digit_count < std::numeric_limits<int>::min()) {
186
0
        *result = is_negative ? StringParser::PARSE_UNDERFLOW : StringParser::PARSE_OVERFLOW;
187
0
        return 0;
188
0
    }
189
172k
    int result_int_part_digit_count = tmp_result_int_part_digit_count;
190
172k
    T int_part_number = 0;
191
172k
    T frac_part_number = 0;
192
172k
    int actual_frac_part_count = 0;
193
172k
    int digit_index = 0;
194
172k
    if (result_int_part_digit_count >= 0) {
195
        // `max_index` is the raw significand index where integer-part digits stop. Add one extra
196
        // raw character only when crossing an in-buffer dot, e.g. "1.4E+2" must scan "1.4" to
197
        // collect three integer digits after the exponent shift. It is capped by end_digit_index
198
        // because missing digits are appended later by multiplying with powers of 10.
199
167k
        int max_index = std::min(found_dot ? (result_int_part_digit_count +
200
153k
                                              ((int_part_count > 0 && exponent > 0) ? 1 : 0))
201
167k
                                           : result_int_part_digit_count,
202
167k
                                 end_digit_index);
203
167k
        max_index = (max_index == std::numeric_limits<int>::min() ? end_digit_index : max_index);
204
        // skip zero number
205
592k
        for (; digit_index != max_index && s[digit_index] == '0'; ++digit_index) {
206
425k
        }
207
        // test 0.00, .00, 0.{00...}e2147483647
208
        // 0.00000e2147483647
209
167k
        if (digit_index != max_index &&
210
167k
            (result_int_part_digit_count - digit_index > type_precision - type_scale)) {
211
20.5k
            *result = is_negative ? StringParser::PARSE_UNDERFLOW : StringParser::PARSE_OVERFLOW;
212
20.5k
            return 0;
213
20.5k
        }
214
        // get int part number
215
820k
        for (; digit_index != max_index; ++digit_index) {
216
673k
            if (UNLIKELY(s[digit_index] == '.')) {
217
1.92k
                continue;
218
1.92k
            }
219
671k
            int_part_number = int_part_number * 10 + (s[digit_index] - '0');
220
671k
        }
221
        // Count only significand digits, not exponent syntax. If the exponent moves the decimal
222
        // point past all available significant digits, append zeros by scaling the integer part:
223
        // "1.4E+2" scans integer 14, total_significant_digit_count=2, then multiplies by 10.
224
147k
        auto total_significant_digit_count =
225
147k
                end_digit_index - ((found_dot && int_part_count > 0) ? 1 : 0);
226
147k
        if (result_int_part_digit_count > total_significant_digit_count) {
227
152
            int_part_number *= get_scale_multiplier<T>(result_int_part_digit_count -
228
152
                                                       total_significant_digit_count);
229
152
        }
230
147k
    } else {
231
        // leading zeros of fraction part
232
4.78k
        actual_frac_part_count = -result_int_part_digit_count;
233
4.78k
    }
234
    // get fraction part number
235
1.18M
    for (; digit_index != end_digit_index && actual_frac_part_count < type_scale; ++digit_index) {
236
1.03M
        if (UNLIKELY(s[digit_index] == '.')) {
237
81.3k
            continue;
238
81.3k
        }
239
951k
        frac_part_number = frac_part_number * 10 + (s[digit_index] - '0');
240
951k
        ++actual_frac_part_count;
241
951k
    }
242
151k
    auto type_scale_multiplier = get_scale_multiplier<T>(type_scale);
243
    // Round only when the next parsed significand digit is exactly the first discarded fractional
244
    // digit. If `actual_frac_part_count` is already greater than type_scale, the missing positions
245
    // are implicit zeros from a negative exponent, so "5e-17" to scale 15 must stay 0 instead of
246
    // rounding up.
247
151k
    if (actual_frac_part_count == type_scale && digit_index != end_digit_index) {
248
40.1k
        if (UNLIKELY(s[digit_index] == '.')) {
249
1.70k
            ++digit_index;
250
1.70k
        }
251
40.1k
        if (digit_index != end_digit_index) {
252
            // example: test 1.5 -> decimal(1, 0)
253
39.7k
            if (s[digit_index] >= '5') {
254
15.9k
                ++frac_part_number;
255
15.9k
                if (frac_part_number == type_scale_multiplier) {
256
1.67k
                    frac_part_number = 0;
257
1.67k
                    ++int_part_number;
258
1.67k
                }
259
15.9k
            }
260
39.7k
        }
261
111k
    } else {
262
111k
        if (actual_frac_part_count < type_scale) {
263
63.5k
            frac_part_number *= get_scale_multiplier<T>(type_scale - actual_frac_part_count);
264
63.5k
        }
265
111k
    }
266
151k
    if (int_part_number >= get_scale_multiplier<T>(type_precision - type_scale)) {
267
32
        *result = is_negative ? StringParser::PARSE_UNDERFLOW : StringParser::PARSE_OVERFLOW;
268
32
        return 0;
269
32
    }
270
271
151k
    T value = int_part_number * type_scale_multiplier + frac_part_number;
272
151k
    *result = StringParser::PARSE_SUCCESS;
273
151k
    return is_negative ? T(-value) : T(value);
274
151k
}
_ZN5doris12StringParser17string_to_decimalILNS_13PrimitiveTypeE30EEENS_19PrimitiveTypeTraitsIXT_EE7CppType10NativeTypeEPKcmiiPNS0_11ParseResultE
Line
Count
Source
61
165k
        ParseResult* result) {
62
165k
    using T = typename PrimitiveTypeTraits<P>::CppType::NativeType;
63
165k
    static_assert(std::is_same_v<T, int32_t> || std::is_same_v<T, int64_t> ||
64
165k
                          std::is_same_v<T, __int128> || std::is_same_v<T, wide::Int256>,
65
165k
                  "Cast string to decimal only support target type int32_t, int64_t, __int128 or "
66
165k
                  "wide::Int256.");
67
68
    // Parse in two logical coordinate systems:
69
    // 1. `s[0, end_digit_index)` is the normalized significand after trimming spaces, sign and
70
    //    leading zeros. If the original value starts with '.', the dot is also skipped so
71
    //    ".14E+3" is parsed as significand "14" with exponent 3.
72
    // 2. `result_int_part_digit_count = int_part_count + exponent` is the decimal point position
73
    //    after applying scientific notation. For example, "1.4E+2" has int_part_count=1,
74
    //    exponent=2, result_int_part_digit_count=3, so "14" becomes integer 140.
75
    // `digit_index` always indexes the normalized significand string, which may still contain a
76
    // dot for inputs like "1.4E+2"; loops that build numbers skip that dot explicitly.
77
    // Ignore leading and trailing spaces.
78
165k
    s = skip_ascii_whitespaces(s, len);
79
80
165k
    bool is_negative = false;
81
165k
    if (len > 0) {
82
165k
        switch (*s) {
83
43.4k
        case '-':
84
43.4k
            is_negative = true;
85
43.4k
            [[fallthrough]];
86
56.7k
        case '+':
87
56.7k
            ++s;
88
56.7k
            --len;
89
165k
        }
90
165k
    }
91
    // Ignore leading zeros.
92
165k
    bool found_value = false;
93
315k
    while (len > 0 && UNLIKELY(*s == '0')) {
94
149k
        found_value = true;
95
149k
        ++s;
96
149k
        --len;
97
149k
    }
98
99
165k
    int found_dot = 0;
100
165k
    if (len > 0 && *s == '.') {
101
50.1k
        found_dot = 1;
102
50.1k
        ++s;
103
50.1k
        --len;
104
50.1k
    }
105
165k
    int int_part_count = 0;
106
165k
    int i = 0;
107
4.12M
    for (; i != len; ++i) {
108
3.98M
        const char& c = s[i];
109
3.98M
        if (LIKELY('0' <= c && c <= '9')) {
110
3.85M
            found_value = true;
111
3.85M
            if (!found_dot) {
112
1.12M
                ++int_part_count;
113
1.12M
            }
114
3.85M
        } else if (c == '.') {
115
99.8k
            if (found_dot) {
116
0
                *result = StringParser::PARSE_FAILURE;
117
0
                return 0;
118
0
            }
119
99.8k
            found_dot = 1;
120
99.8k
        } else {
121
24.9k
            break;
122
24.9k
        }
123
3.98M
    }
124
165k
    if (!found_value) {
125
        // '', '.'
126
110
        *result = StringParser::PARSE_FAILURE;
127
110
        return 0;
128
110
    }
129
    // Parse exponent if any. Keep `end_digit_index` before consuming 'e/E' so later digit counts
130
    // ignore exponent syntax. For "1.4E+2", end_digit_index points just after "1.4", not after
131
    // "E+2".
132
165k
    int64_t exponent = 0;
133
165k
    auto end_digit_index = i;
134
165k
    if (i != len) {
135
24.8k
        bool negative_exponent = false;
136
24.8k
        if (s[i] == 'e' || s[i] == 'E') {
137
24.8k
            ++i;
138
24.8k
            if (i != len) {
139
24.8k
                switch (s[i]) {
140
9.33k
                case '-':
141
9.33k
                    negative_exponent = true;
142
9.33k
                    [[fallthrough]];
143
9.34k
                case '+':
144
9.34k
                    ++i;
145
24.8k
                }
146
24.8k
            }
147
24.8k
            if (i == len) {
148
                // '123e', '123e+', '123e-'
149
0
                *result = StringParser::PARSE_FAILURE;
150
0
                return 0;
151
0
            }
152
71.6k
            for (; i != len; ++i) {
153
46.8k
                const char& c = s[i];
154
46.8k
                if (LIKELY('0' <= c && c <= '9')) {
155
46.8k
                    exponent = exponent * 10 + (c - '0');
156
                    // max string len is config::string_type_length_soft_limit_bytes,
157
                    // whose max value is std::numeric_limits<int32_t>::max() - 4,
158
                    // just check overflow of int32_t to simplify the logic
159
                    // For edge cases like 0.{2147483647 zeros}e+2147483647
160
46.8k
                    if (exponent > std::numeric_limits<int32_t>::max()) {
161
0
                        *result = StringParser::PARSE_OVERFLOW;
162
0
                        return 0;
163
0
                    }
164
46.8k
                } else {
165
                    // '123e12abc', '123e1.2'
166
0
                    *result = StringParser::PARSE_FAILURE;
167
0
                    return 0;
168
0
                }
169
46.8k
            }
170
24.8k
            if (negative_exponent) {
171
9.33k
                exponent = -exponent;
172
9.33k
            }
173
24.8k
        } else {
174
24
            *result = StringParser::PARSE_FAILURE;
175
24
            return 0;
176
24
        }
177
24.8k
    }
178
    // TODO: check limit values of exponent and add UT
179
    // max string len is config::string_type_length_soft_limit_bytes,
180
    // whose max value is std::numeric_limits<int32_t>::max() - 4,
181
    // so int_part_count will be in range of int32_t,
182
    // and int_part_count + exponent will be in range of int64_t
183
165k
    int64_t tmp_result_int_part_digit_count = int_part_count + exponent;
184
165k
    if (tmp_result_int_part_digit_count > std::numeric_limits<int>::max() ||
185
165k
        tmp_result_int_part_digit_count < std::numeric_limits<int>::min()) {
186
0
        *result = is_negative ? StringParser::PARSE_UNDERFLOW : StringParser::PARSE_OVERFLOW;
187
0
        return 0;
188
0
    }
189
165k
    int result_int_part_digit_count = tmp_result_int_part_digit_count;
190
165k
    T int_part_number = 0;
191
165k
    T frac_part_number = 0;
192
165k
    int actual_frac_part_count = 0;
193
165k
    int digit_index = 0;
194
165k
    if (result_int_part_digit_count >= 0) {
195
        // `max_index` is the raw significand index where integer-part digits stop. Add one extra
196
        // raw character only when crossing an in-buffer dot, e.g. "1.4E+2" must scan "1.4" to
197
        // collect three integer digits after the exponent shift. It is capped by end_digit_index
198
        // because missing digits are appended later by multiplying with powers of 10.
199
159k
        int max_index = std::min(found_dot ? (result_int_part_digit_count +
200
146k
                                              ((int_part_count > 0 && exponent > 0) ? 1 : 0))
201
159k
                                           : result_int_part_digit_count,
202
159k
                                 end_digit_index);
203
159k
        max_index = (max_index == std::numeric_limits<int>::min() ? end_digit_index : max_index);
204
        // skip zero number
205
587k
        for (; digit_index != max_index && s[digit_index] == '0'; ++digit_index) {
206
427k
        }
207
        // test 0.00, .00, 0.{00...}e2147483647
208
        // 0.00000e2147483647
209
159k
        if (digit_index != max_index &&
210
159k
            (result_int_part_digit_count - digit_index > type_precision - type_scale)) {
211
280
            *result = is_negative ? StringParser::PARSE_UNDERFLOW : StringParser::PARSE_OVERFLOW;
212
280
            return 0;
213
280
        }
214
        // get int part number
215
1.19M
        for (; digit_index != max_index; ++digit_index) {
216
1.04M
            if (UNLIKELY(s[digit_index] == '.')) {
217
1.92k
                continue;
218
1.92k
            }
219
1.03M
            int_part_number = int_part_number * 10 + (s[digit_index] - '0');
220
1.03M
        }
221
        // Count only significand digits, not exponent syntax. If the exponent moves the decimal
222
        // point past all available significant digits, append zeros by scaling the integer part:
223
        // "1.4E+2" scans integer 14, total_significant_digit_count=2, then multiplies by 10.
224
158k
        auto total_significant_digit_count =
225
158k
                end_digit_index - ((found_dot && int_part_count > 0) ? 1 : 0);
226
158k
        if (result_int_part_digit_count > total_significant_digit_count) {
227
160
            int_part_number *= get_scale_multiplier<T>(result_int_part_digit_count -
228
160
                                                       total_significant_digit_count);
229
160
        }
230
158k
    } else {
231
        // leading zeros of fraction part
232
6.35k
        actual_frac_part_count = -result_int_part_digit_count;
233
6.35k
    }
234
    // get fraction part number
235
2.43M
    for (; digit_index != end_digit_index && actual_frac_part_count < type_scale; ++digit_index) {
236
2.26M
        if (UNLIKELY(s[digit_index] == '.')) {
237
92.6k
            continue;
238
92.6k
        }
239
2.17M
        frac_part_number = frac_part_number * 10 + (s[digit_index] - '0');
240
2.17M
        ++actual_frac_part_count;
241
2.17M
    }
242
165k
    auto type_scale_multiplier = get_scale_multiplier<T>(type_scale);
243
    // Round only when the next parsed significand digit is exactly the first discarded fractional
244
    // digit. If `actual_frac_part_count` is already greater than type_scale, the missing positions
245
    // are implicit zeros from a negative exponent, so "5e-17" to scale 15 must stay 0 instead of
246
    // rounding up.
247
165k
    if (actual_frac_part_count == type_scale && digit_index != end_digit_index) {
248
40.3k
        if (UNLIKELY(s[digit_index] == '.')) {
249
1.70k
            ++digit_index;
250
1.70k
        }
251
40.3k
        if (digit_index != end_digit_index) {
252
            // example: test 1.5 -> decimal(1, 0)
253
39.9k
            if (s[digit_index] >= '5') {
254
16.0k
                ++frac_part_number;
255
16.0k
                if (frac_part_number == type_scale_multiplier) {
256
1.81k
                    frac_part_number = 0;
257
1.81k
                    ++int_part_number;
258
1.81k
                }
259
16.0k
            }
260
39.9k
        }
261
124k
    } else {
262
124k
        if (actual_frac_part_count < type_scale) {
263
90.0k
            frac_part_number *= get_scale_multiplier<T>(type_scale - actual_frac_part_count);
264
90.0k
        }
265
124k
    }
266
165k
    if (int_part_number >= get_scale_multiplier<T>(type_precision - type_scale)) {
267
32
        *result = is_negative ? StringParser::PARSE_UNDERFLOW : StringParser::PARSE_OVERFLOW;
268
32
        return 0;
269
32
    }
270
271
165k
    T value = int_part_number * type_scale_multiplier + frac_part_number;
272
165k
    *result = StringParser::PARSE_SUCCESS;
273
165k
    return is_negative ? T(-value) : T(value);
274
165k
}
_ZN5doris12StringParser17string_to_decimalILNS_13PrimitiveTypeE20EEENS_19PrimitiveTypeTraitsIXT_EE7CppType10NativeTypeEPKcmiiPNS0_11ParseResultE
Line
Count
Source
61
27.1k
        ParseResult* result) {
62
27.1k
    using T = typename PrimitiveTypeTraits<P>::CppType::NativeType;
63
27.1k
    static_assert(std::is_same_v<T, int32_t> || std::is_same_v<T, int64_t> ||
64
27.1k
                          std::is_same_v<T, __int128> || std::is_same_v<T, wide::Int256>,
65
27.1k
                  "Cast string to decimal only support target type int32_t, int64_t, __int128 or "
66
27.1k
                  "wide::Int256.");
67
68
    // Parse in two logical coordinate systems:
69
    // 1. `s[0, end_digit_index)` is the normalized significand after trimming spaces, sign and
70
    //    leading zeros. If the original value starts with '.', the dot is also skipped so
71
    //    ".14E+3" is parsed as significand "14" with exponent 3.
72
    // 2. `result_int_part_digit_count = int_part_count + exponent` is the decimal point position
73
    //    after applying scientific notation. For example, "1.4E+2" has int_part_count=1,
74
    //    exponent=2, result_int_part_digit_count=3, so "14" becomes integer 140.
75
    // `digit_index` always indexes the normalized significand string, which may still contain a
76
    // dot for inputs like "1.4E+2"; loops that build numbers skip that dot explicitly.
77
    // Ignore leading and trailing spaces.
78
27.1k
    s = skip_ascii_whitespaces(s, len);
79
80
27.1k
    bool is_negative = false;
81
27.1k
    if (len > 0) {
82
27.1k
        switch (*s) {
83
13.3k
        case '-':
84
13.3k
            is_negative = true;
85
13.3k
            [[fallthrough]];
86
13.3k
        case '+':
87
13.3k
            ++s;
88
13.3k
            --len;
89
27.1k
        }
90
27.1k
    }
91
    // Ignore leading zeros.
92
27.1k
    bool found_value = false;
93
104k
    while (len > 0 && UNLIKELY(*s == '0')) {
94
77.6k
        found_value = true;
95
77.6k
        ++s;
96
77.6k
        --len;
97
77.6k
    }
98
99
27.1k
    int found_dot = 0;
100
27.1k
    if (len > 0 && *s == '.') {
101
4.00k
        found_dot = 1;
102
4.00k
        ++s;
103
4.00k
        --len;
104
4.00k
    }
105
27.1k
    int int_part_count = 0;
106
27.1k
    int i = 0;
107
560k
    for (; i != len; ++i) {
108
532k
        const char& c = s[i];
109
532k
        if (LIKELY('0' <= c && c <= '9')) {
110
510k
            found_value = true;
111
510k
            if (!found_dot) {
112
273k
                ++int_part_count;
113
273k
            }
114
510k
        } else if (c == '.') {
115
22.8k
            if (found_dot) {
116
0
                *result = StringParser::PARSE_FAILURE;
117
0
                return 0;
118
0
            }
119
22.8k
            found_dot = 1;
120
22.8k
        } else {
121
22
            break;
122
22
        }
123
532k
    }
124
27.1k
    if (!found_value) {
125
        // '', '.'
126
20
        *result = StringParser::PARSE_FAILURE;
127
20
        return 0;
128
20
    }
129
    // Parse exponent if any. Keep `end_digit_index` before consuming 'e/E' so later digit counts
130
    // ignore exponent syntax. For "1.4E+2", end_digit_index points just after "1.4", not after
131
    // "E+2".
132
27.0k
    int64_t exponent = 0;
133
27.0k
    auto end_digit_index = i;
134
27.0k
    if (i != len) {
135
2
        bool negative_exponent = false;
136
2
        if (s[i] == 'e' || s[i] == 'E') {
137
0
            ++i;
138
0
            if (i != len) {
139
0
                switch (s[i]) {
140
0
                case '-':
141
0
                    negative_exponent = true;
142
0
                    [[fallthrough]];
143
0
                case '+':
144
0
                    ++i;
145
0
                }
146
0
            }
147
0
            if (i == len) {
148
                // '123e', '123e+', '123e-'
149
0
                *result = StringParser::PARSE_FAILURE;
150
0
                return 0;
151
0
            }
152
0
            for (; i != len; ++i) {
153
0
                const char& c = s[i];
154
0
                if (LIKELY('0' <= c && c <= '9')) {
155
0
                    exponent = exponent * 10 + (c - '0');
156
                    // max string len is config::string_type_length_soft_limit_bytes,
157
                    // whose max value is std::numeric_limits<int32_t>::max() - 4,
158
                    // just check overflow of int32_t to simplify the logic
159
                    // For edge cases like 0.{2147483647 zeros}e+2147483647
160
0
                    if (exponent > std::numeric_limits<int32_t>::max()) {
161
0
                        *result = StringParser::PARSE_OVERFLOW;
162
0
                        return 0;
163
0
                    }
164
0
                } else {
165
                    // '123e12abc', '123e1.2'
166
0
                    *result = StringParser::PARSE_FAILURE;
167
0
                    return 0;
168
0
                }
169
0
            }
170
0
            if (negative_exponent) {
171
0
                exponent = -exponent;
172
0
            }
173
2
        } else {
174
2
            *result = StringParser::PARSE_FAILURE;
175
2
            return 0;
176
2
        }
177
2
    }
178
    // TODO: check limit values of exponent and add UT
179
    // max string len is config::string_type_length_soft_limit_bytes,
180
    // whose max value is std::numeric_limits<int32_t>::max() - 4,
181
    // so int_part_count will be in range of int32_t,
182
    // and int_part_count + exponent will be in range of int64_t
183
27.0k
    int64_t tmp_result_int_part_digit_count = int_part_count + exponent;
184
27.0k
    if (tmp_result_int_part_digit_count > std::numeric_limits<int>::max() ||
185
27.0k
        tmp_result_int_part_digit_count < std::numeric_limits<int>::min()) {
186
0
        *result = is_negative ? StringParser::PARSE_UNDERFLOW : StringParser::PARSE_OVERFLOW;
187
0
        return 0;
188
0
    }
189
27.0k
    int result_int_part_digit_count = tmp_result_int_part_digit_count;
190
27.0k
    T int_part_number = 0;
191
27.0k
    T frac_part_number = 0;
192
27.0k
    int actual_frac_part_count = 0;
193
27.0k
    int digit_index = 0;
194
27.0k
    if (result_int_part_digit_count >= 0) {
195
        // `max_index` is the raw significand index where integer-part digits stop. Add one extra
196
        // raw character only when crossing an in-buffer dot, e.g. "1.4E+2" must scan "1.4" to
197
        // collect three integer digits after the exponent shift. It is capped by end_digit_index
198
        // because missing digits are appended later by multiplying with powers of 10.
199
27.0k
        int max_index = std::min(found_dot ? (result_int_part_digit_count +
200
26.8k
                                              ((int_part_count > 0 && exponent > 0) ? 1 : 0))
201
27.0k
                                           : result_int_part_digit_count,
202
27.0k
                                 end_digit_index);
203
27.0k
        max_index = (max_index == std::numeric_limits<int>::min() ? end_digit_index : max_index);
204
        // skip zero number
205
27.0k
        for (; digit_index != max_index && s[digit_index] == '0'; ++digit_index) {
206
0
        }
207
        // test 0.00, .00, 0.{00...}e2147483647
208
        // 0.00000e2147483647
209
27.0k
        if (digit_index != max_index &&
210
27.0k
            (result_int_part_digit_count - digit_index > type_precision - type_scale)) {
211
16
            *result = is_negative ? StringParser::PARSE_UNDERFLOW : StringParser::PARSE_OVERFLOW;
212
16
            return 0;
213
16
        }
214
        // get int part number
215
300k
        for (; digit_index != max_index; ++digit_index) {
216
272k
            if (UNLIKELY(s[digit_index] == '.')) {
217
0
                continue;
218
0
            }
219
272k
            int_part_number = int_part_number * 10 + (s[digit_index] - '0');
220
272k
        }
221
        // Count only significand digits, not exponent syntax. If the exponent moves the decimal
222
        // point past all available significant digits, append zeros by scaling the integer part:
223
        // "1.4E+2" scans integer 14, total_significant_digit_count=2, then multiplies by 10.
224
27.0k
        auto total_significant_digit_count =
225
27.0k
                end_digit_index - ((found_dot && int_part_count > 0) ? 1 : 0);
226
27.0k
        if (result_int_part_digit_count > total_significant_digit_count) {
227
0
            int_part_number *= get_scale_multiplier<T>(result_int_part_digit_count -
228
0
                                                       total_significant_digit_count);
229
0
        }
230
27.0k
    } else {
231
        // leading zeros of fraction part
232
0
        actual_frac_part_count = -result_int_part_digit_count;
233
0
    }
234
    // get fraction part number
235
286k
    for (; digit_index != end_digit_index && actual_frac_part_count < type_scale; ++digit_index) {
236
259k
        if (UNLIKELY(s[digit_index] == '.')) {
237
22.8k
            continue;
238
22.8k
        }
239
236k
        frac_part_number = frac_part_number * 10 + (s[digit_index] - '0');
240
236k
        ++actual_frac_part_count;
241
236k
    }
242
27.0k
    auto type_scale_multiplier = get_scale_multiplier<T>(type_scale);
243
    // Round only when the next parsed significand digit is exactly the first discarded fractional
244
    // digit. If `actual_frac_part_count` is already greater than type_scale, the missing positions
245
    // are implicit zeros from a negative exponent, so "5e-17" to scale 15 must stay 0 instead of
246
    // rounding up.
247
27.0k
    if (actual_frac_part_count == type_scale && digit_index != end_digit_index) {
248
34
        if (UNLIKELY(s[digit_index] == '.')) {
249
0
            ++digit_index;
250
0
        }
251
34
        if (digit_index != end_digit_index) {
252
            // example: test 1.5 -> decimal(1, 0)
253
34
            if (s[digit_index] >= '5') {
254
34
                ++frac_part_number;
255
34
                if (frac_part_number == type_scale_multiplier) {
256
0
                    frac_part_number = 0;
257
0
                    ++int_part_number;
258
0
                }
259
34
            }
260
34
        }
261
27.0k
    } else {
262
27.0k
        if (actual_frac_part_count < type_scale) {
263
3.89k
            frac_part_number *= get_scale_multiplier<T>(type_scale - actual_frac_part_count);
264
3.89k
        }
265
27.0k
    }
266
27.0k
    if (int_part_number >= get_scale_multiplier<T>(type_precision - type_scale)) {
267
0
        *result = is_negative ? StringParser::PARSE_UNDERFLOW : StringParser::PARSE_OVERFLOW;
268
0
        return 0;
269
0
    }
270
271
27.0k
    T value = int_part_number * type_scale_multiplier + frac_part_number;
272
27.0k
    *result = StringParser::PARSE_SUCCESS;
273
27.0k
    return is_negative ? T(-value) : T(value);
274
27.0k
}
_ZN5doris12StringParser17string_to_decimalILNS_13PrimitiveTypeE35EEENS_19PrimitiveTypeTraitsIXT_EE7CppType10NativeTypeEPKcmiiPNS0_11ParseResultE
Line
Count
Source
61
233k
        ParseResult* result) {
62
233k
    using T = typename PrimitiveTypeTraits<P>::CppType::NativeType;
63
233k
    static_assert(std::is_same_v<T, int32_t> || std::is_same_v<T, int64_t> ||
64
233k
                          std::is_same_v<T, __int128> || std::is_same_v<T, wide::Int256>,
65
233k
                  "Cast string to decimal only support target type int32_t, int64_t, __int128 or "
66
233k
                  "wide::Int256.");
67
68
    // Parse in two logical coordinate systems:
69
    // 1. `s[0, end_digit_index)` is the normalized significand after trimming spaces, sign and
70
    //    leading zeros. If the original value starts with '.', the dot is also skipped so
71
    //    ".14E+3" is parsed as significand "14" with exponent 3.
72
    // 2. `result_int_part_digit_count = int_part_count + exponent` is the decimal point position
73
    //    after applying scientific notation. For example, "1.4E+2" has int_part_count=1,
74
    //    exponent=2, result_int_part_digit_count=3, so "14" becomes integer 140.
75
    // `digit_index` always indexes the normalized significand string, which may still contain a
76
    // dot for inputs like "1.4E+2"; loops that build numbers skip that dot explicitly.
77
    // Ignore leading and trailing spaces.
78
233k
    s = skip_ascii_whitespaces(s, len);
79
80
233k
    bool is_negative = false;
81
233k
    if (len > 0) {
82
233k
        switch (*s) {
83
34.6k
        case '-':
84
34.6k
            is_negative = true;
85
34.6k
            [[fallthrough]];
86
47.9k
        case '+':
87
47.9k
            ++s;
88
47.9k
            --len;
89
233k
        }
90
233k
    }
91
    // Ignore leading zeros.
92
233k
    bool found_value = false;
93
421k
    while (len > 0 && UNLIKELY(*s == '0')) {
94
187k
        found_value = true;
95
187k
        ++s;
96
187k
        --len;
97
187k
    }
98
99
233k
    int found_dot = 0;
100
233k
    if (len > 0 && *s == '.') {
101
31.6k
        found_dot = 1;
102
31.6k
        ++s;
103
31.6k
        --len;
104
31.6k
    }
105
233k
    int int_part_count = 0;
106
233k
    int i = 0;
107
7.82M
    for (; i != len; ++i) {
108
7.74M
        const char& c = s[i];
109
7.74M
        if (LIKELY('0' <= c && c <= '9')) {
110
7.39M
            found_value = true;
111
7.39M
            if (!found_dot) {
112
1.98M
                ++int_part_count;
113
1.98M
            }
114
7.39M
        } else if (c == '.') {
115
189k
            if (found_dot) {
116
0
                *result = StringParser::PARSE_FAILURE;
117
0
                return 0;
118
0
            }
119
189k
            found_dot = 1;
120
189k
        } else {
121
153k
            break;
122
153k
        }
123
7.74M
    }
124
233k
    if (!found_value) {
125
        // '', '.'
126
132
        *result = StringParser::PARSE_FAILURE;
127
132
        return 0;
128
132
    }
129
    // Parse exponent if any. Keep `end_digit_index` before consuming 'e/E' so later digit counts
130
    // ignore exponent syntax. For "1.4E+2", end_digit_index points just after "1.4", not after
131
    // "E+2".
132
233k
    int64_t exponent = 0;
133
233k
    auto end_digit_index = i;
134
233k
    if (i != len) {
135
153k
        bool negative_exponent = false;
136
153k
        if (s[i] == 'e' || s[i] == 'E') {
137
153k
            ++i;
138
153k
            if (i != len) {
139
153k
                switch (s[i]) {
140
3.07k
                case '-':
141
3.07k
                    negative_exponent = true;
142
3.07k
                    [[fallthrough]];
143
138k
                case '+':
144
138k
                    ++i;
145
153k
                }
146
153k
            }
147
153k
            if (i == len) {
148
                // '123e', '123e+', '123e-'
149
0
                *result = StringParser::PARSE_FAILURE;
150
0
                return 0;
151
0
            }
152
458k
            for (; i != len; ++i) {
153
304k
                const char& c = s[i];
154
304k
                if (LIKELY('0' <= c && c <= '9')) {
155
304k
                    exponent = exponent * 10 + (c - '0');
156
                    // max string len is config::string_type_length_soft_limit_bytes,
157
                    // whose max value is std::numeric_limits<int32_t>::max() - 4,
158
                    // just check overflow of int32_t to simplify the logic
159
                    // For edge cases like 0.{2147483647 zeros}e+2147483647
160
304k
                    if (exponent > std::numeric_limits<int32_t>::max()) {
161
0
                        *result = StringParser::PARSE_OVERFLOW;
162
0
                        return 0;
163
0
                    }
164
304k
                } else {
165
                    // '123e12abc', '123e1.2'
166
20
                    *result = StringParser::PARSE_FAILURE;
167
20
                    return 0;
168
20
                }
169
304k
            }
170
153k
            if (negative_exponent) {
171
3.07k
                exponent = -exponent;
172
3.07k
            }
173
153k
        } else {
174
28
            *result = StringParser::PARSE_FAILURE;
175
28
            return 0;
176
28
        }
177
153k
    }
178
    // TODO: check limit values of exponent and add UT
179
    // max string len is config::string_type_length_soft_limit_bytes,
180
    // whose max value is std::numeric_limits<int32_t>::max() - 4,
181
    // so int_part_count will be in range of int32_t,
182
    // and int_part_count + exponent will be in range of int64_t
183
233k
    int64_t tmp_result_int_part_digit_count = int_part_count + exponent;
184
233k
    if (tmp_result_int_part_digit_count > std::numeric_limits<int>::max() ||
185
233k
        tmp_result_int_part_digit_count < std::numeric_limits<int>::min()) {
186
0
        *result = is_negative ? StringParser::PARSE_UNDERFLOW : StringParser::PARSE_OVERFLOW;
187
0
        return 0;
188
0
    }
189
233k
    int result_int_part_digit_count = tmp_result_int_part_digit_count;
190
233k
    T int_part_number = 0;
191
233k
    T frac_part_number = 0;
192
233k
    int actual_frac_part_count = 0;
193
233k
    int digit_index = 0;
194
233k
    if (result_int_part_digit_count >= 0) {
195
        // `max_index` is the raw significand index where integer-part digits stop. Add one extra
196
        // raw character only when crossing an in-buffer dot, e.g. "1.4E+2" must scan "1.4" to
197
        // collect three integer digits after the exponent shift. It is capped by end_digit_index
198
        // because missing digits are appended later by multiplying with powers of 10.
199
233k
        int max_index = std::min(found_dot ? (result_int_part_digit_count +
200
220k
                                              ((int_part_count > 0 && exponent > 0) ? 1 : 0))
201
233k
                                           : result_int_part_digit_count,
202
233k
                                 end_digit_index);
203
233k
        max_index = (max_index == std::numeric_limits<int>::min() ? end_digit_index : max_index);
204
        // skip zero number
205
665k
        for (; digit_index != max_index && s[digit_index] == '0'; ++digit_index) {
206
432k
        }
207
        // test 0.00, .00, 0.{00...}e2147483647
208
        // 0.00000e2147483647
209
233k
        if (digit_index != max_index &&
210
233k
            (result_int_part_digit_count - digit_index > type_precision - type_scale)) {
211
224
            *result = is_negative ? StringParser::PARSE_UNDERFLOW : StringParser::PARSE_OVERFLOW;
212
224
            return 0;
213
224
        }
214
        // get int part number
215
4.36M
        for (; digit_index != max_index; ++digit_index) {
216
4.12M
            if (UNLIKELY(s[digit_index] == '.')) {
217
137k
                continue;
218
137k
            }
219
3.99M
            int_part_number = int_part_number * 10 + (s[digit_index] - '0');
220
3.99M
        }
221
        // Count only significand digits, not exponent syntax. If the exponent moves the decimal
222
        // point past all available significant digits, append zeros by scaling the integer part:
223
        // "1.4E+2" scans integer 14, total_significant_digit_count=2, then multiplies by 10.
224
232k
        auto total_significant_digit_count =
225
232k
                end_digit_index - ((found_dot && int_part_count > 0) ? 1 : 0);
226
232k
        if (result_int_part_digit_count > total_significant_digit_count) {
227
131k
            int_part_number *= get_scale_multiplier<T>(result_int_part_digit_count -
228
131k
                                                       total_significant_digit_count);
229
131k
        }
230
232k
    } else {
231
        // leading zeros of fraction part
232
96
        actual_frac_part_count = -result_int_part_digit_count;
233
96
    }
234
    // get fraction part number
235
3.02M
    for (; digit_index != end_digit_index && actual_frac_part_count < type_scale; ++digit_index) {
236
2.79M
        if (UNLIKELY(s[digit_index] == '.')) {
237
48.2k
            continue;
238
48.2k
        }
239
2.74M
        frac_part_number = frac_part_number * 10 + (s[digit_index] - '0');
240
2.74M
        ++actual_frac_part_count;
241
2.74M
    }
242
233k
    auto type_scale_multiplier = get_scale_multiplier<T>(type_scale);
243
    // Round only when the next parsed significand digit is exactly the first discarded fractional
244
    // digit. If `actual_frac_part_count` is already greater than type_scale, the missing positions
245
    // are implicit zeros from a negative exponent, so "5e-17" to scale 15 must stay 0 instead of
246
    // rounding up.
247
233k
    if (actual_frac_part_count == type_scale && digit_index != end_digit_index) {
248
38.1k
        if (UNLIKELY(s[digit_index] == '.')) {
249
1.70k
            ++digit_index;
250
1.70k
        }
251
38.1k
        if (digit_index != end_digit_index) {
252
            // example: test 1.5 -> decimal(1, 0)
253
37.7k
            if (s[digit_index] >= '5') {
254
15.8k
                ++frac_part_number;
255
15.8k
                if (frac_part_number == type_scale_multiplier) {
256
1.67k
                    frac_part_number = 0;
257
1.67k
                    ++int_part_number;
258
1.67k
                }
259
15.8k
            }
260
37.7k
        }
261
194k
    } else {
262
194k
        if (actual_frac_part_count < type_scale) {
263
176k
            frac_part_number *= get_scale_multiplier<T>(type_scale - actual_frac_part_count);
264
176k
        }
265
194k
    }
266
233k
    if (int_part_number >= get_scale_multiplier<T>(type_precision - type_scale)) {
267
32
        *result = is_negative ? StringParser::PARSE_UNDERFLOW : StringParser::PARSE_OVERFLOW;
268
32
        return 0;
269
32
    }
270
271
233k
    T value = int_part_number * type_scale_multiplier + frac_part_number;
272
233k
    *result = StringParser::PARSE_SUCCESS;
273
233k
    return is_negative ? T(-value) : T(value);
274
233k
}
275
276
template Int32 StringParser::string_to_decimal<PrimitiveType::TYPE_DECIMAL32>( // explicit instantiation: TYPE_DECIMAL32 returns Int32
277
        const char* __restrict s, size_t len, int type_precision, int type_scale,
278
        ParseResult* result); // NOTE(review): presumably instantiated here because the template body lives in this .cpp; confirm against the header
279
template Int64 StringParser::string_to_decimal<PrimitiveType::TYPE_DECIMAL64>( // explicit instantiation: TYPE_DECIMAL64 returns Int64
280
        const char* __restrict s, size_t len, int type_precision, int type_scale,
281
        ParseResult* result);
282
template Int128 StringParser::string_to_decimal<PrimitiveType::TYPE_DECIMAL128I>( // explicit instantiation: TYPE_DECIMAL128I returns Int128
283
        const char* __restrict s, size_t len, int type_precision, int type_scale,
284
        ParseResult* result);
285
template Int128 StringParser::string_to_decimal<PrimitiveType::TYPE_DECIMALV2>( // explicit instantiation: TYPE_DECIMALV2 also returns Int128, like TYPE_DECIMAL128I
286
        const char* __restrict s, size_t len, int type_precision, int type_scale,
287
        ParseResult* result);
288
template wide::Int256 StringParser::string_to_decimal<PrimitiveType::TYPE_DECIMAL256>( // explicit instantiation: TYPE_DECIMAL256 returns wide::Int256
289
        const char* __restrict s, size_t len, int type_precision, int type_scale,
290
        ParseResult* result);
291
} // end namespace doris
292
#include "common/compile_check_avoid_end.h"