Coverage Report

Created: 2026-04-06 09:26

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/util/bit_packing.inline.h
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
// The implementation of BitPacking is from Impala.
19
20
#include <boost/preprocessor/repetition/repeat_from_to.hpp>
21
22
#include "util/bit_packing.h"
23
24
namespace doris {
25
#include "common/compile_check_begin.h"
26
22.8k
// Returns how many values of 'bit_width' bits can be unpacked when the caller asks for
// 'num_values' values and 'in_bytes' bytes of packed input are available.
//
// The result is 'num_values' when the input is large enough (or when 'bit_width' is 0, in
// which case no input is needed), otherwise the number of whole values that fit in
// 'in_bytes' bytes.
//
// The limit is computed entirely in 64-bit arithmetic. Casting the total bit count
// 'num_values * bit_width' to uint32_t would silently truncate it once it reaches 2^32
// bits, which could make an undersized input look sufficient.
inline int64_t BitPacking::NumValuesToUnpack(int bit_width, int64_t in_bytes, int64_t num_values) {
    // Zero-width values consume no input, so only the output space limits the count.
    if (bit_width == 0) {
        return num_values;
    }
    // Whole values available in the input. 'num_values' values fit exactly when
    // num_values * bit_width <= in_bytes * CHAR_BIT, i.e. when num_values does not exceed
    // this quotient, so comparing against it needs no multiplication by 'num_values'.
    const int64_t values_in_input = (in_bytes * CHAR_BIT) / bit_width;
    if (num_values <= values_in_input) {
        // Limited by output space.
        return num_values;
    }
    // Limited by the number of input bytes.
    return values_in_input;
}
38
39
0
// Returns a 64-bit mask whose low 'num_bits' bits are set and whose remaining bits are clear.
//
// 'num_bits' >= 64 yields an all-ones mask; 'num_bits' <= 0 yields 0. Both ends are handled
// explicitly because shifting a 64-bit value by a negative amount or by 64 or more is
// undefined behavior.
constexpr uint64_t GetMask(int num_bits) {
    if (num_bits <= 0) {
        return 0;
    }
    if (num_bits >= 64) {
        // Unsigned literal: the result does not rely on converting a signed -1.
        return ~uint64_t {0};
    }
    return (uint64_t {1} << num_bits) - 1;
}
45
46
// Compile-time predicate: true exactly when T is one of uint8_t, uint16_t, uint32_t or
// uint64_t, the output element types accepted by the unpacking routines' static_asserts.
template <typename T>
constexpr bool IsSupportedUnpackingType() {
    constexpr bool is_8_or_16_bit =
            std::is_same<T, uint8_t>::value || std::is_same<T, uint16_t>::value;
    constexpr bool is_32_or_64_bit =
            std::is_same<T, uint32_t>::value || std::is_same<T, uint64_t>::value;
    return is_8_or_16_bit || is_32_or_64_bit;
}
Unexecuted instantiation: _ZN5doris24IsSupportedUnpackingTypeIjEEbv
Unexecuted instantiation: _ZN5doris24IsSupportedUnpackingTypeIhEEbv
51
52
// Runtime-width entry point: unpacks values of 'bit_width' bits from 'in' into 'out' by
// forwarding to the UnpackValues<OutType, BIT_WIDTH> overload selected by the switch below,
// and returns whatever that overload returns. If no generated case matches 'bit_width', the
// function hits DCHECK(false) and returns the pair (nullptr, -1).
template <typename OutType>
53
std::pair<const uint8_t*, int64_t> BitPacking::UnpackValues(int bit_width,
54
                                                            const uint8_t* __restrict__ in,
55
                                                            int64_t in_bytes, int64_t num_values,
56
22.8k
                                                            OutType* __restrict__ out) {
57
22.8k
    static_assert(IsSupportedUnpackingType<OutType>(), "Only unsigned integers are supported.");
58
59
22.8k
    // Save any existing definition of UNPACK_VALUES_CASE; the helper macro defined next is
    // local to this function and the saved definition is restored by pop_macro at the end.
#pragma push_macro("UNPACK_VALUES_CASE")
60
22.8k
#define UNPACK_VALUES_CASE(ignore1, i, ignore2) \
61
22.8k
    case i:                                     \
62
22.8k
        return UnpackValues<OutType, i>(in, in_bytes, num_values, out);
63
64
22.8k
    // Map the runtime 'bit_width' onto the matching compile-time BIT_WIDTH instantiation.
    switch (bit_width) {
65
        // Expand one 'case i:' per bit width from 0 to 64 (the upper bound 65 is exclusive).
66
22.8k
        BOOST_PP_REPEAT_FROM_TO(0, 65, UNPACK_VALUES_CASE, ignore);
67
0
    // No generated case matched 'bit_width'.
    default:
68
0
        DCHECK(false);
69
0
        return std::make_pair(nullptr, -1);
70
22.8k
    }
71
22.8k
    // Restore the UNPACK_VALUES_CASE definition saved by push_macro above.
#pragma pop_macro("UNPACK_VALUES_CASE")
72
22.8k
}
_ZN5doris10BitPacking12UnpackValuesIjEESt4pairIPKhlEiS4_llPT_
Line
Count
Source
56
22.7k
                                                            OutType* __restrict__ out) {
57
22.7k
    static_assert(IsSupportedUnpackingType<OutType>(), "Only unsigned integers are supported.");
58
59
22.7k
#pragma push_macro("UNPACK_VALUES_CASE")
60
22.7k
#define UNPACK_VALUES_CASE(ignore1, i, ignore2) \
61
22.7k
    case i:                                     \
62
22.7k
        return UnpackValues<OutType, i>(in, in_bytes, num_values, out);
63
64
22.7k
    switch (bit_width) {
65
        // Expand cases from 0 to 64.
66
0
        BOOST_PP_REPEAT_FROM_TO(0, 65, UNPACK_VALUES_CASE, ignore);
67
0
    default:
68
        DCHECK(false);
69
0
        return std::make_pair(nullptr, -1);
70
22.7k
    }
71
22.7k
#pragma pop_macro("UNPACK_VALUES_CASE")
72
22.7k
}
_ZN5doris10BitPacking12UnpackValuesIhEESt4pairIPKhlEiS4_llPT_
Line
Count
Source
56
176
                                                            OutType* __restrict__ out) {
57
176
    static_assert(IsSupportedUnpackingType<OutType>(), "Only unsigned integers are supported.");
58
59
176
#pragma push_macro("UNPACK_VALUES_CASE")
60
176
#define UNPACK_VALUES_CASE(ignore1, i, ignore2) \
61
176
    case i:                                     \
62
176
        return UnpackValues<OutType, i>(in, in_bytes, num_values, out);
63
64
176
    switch (bit_width) {
65
        // Expand cases from 0 to 64.
66
0
        BOOST_PP_REPEAT_FROM_TO(0, 65, UNPACK_VALUES_CASE, ignore);
67
0
    default:
68
        DCHECK(false);
69
0
        return std::make_pair(nullptr, -1);
70
176
    }
71
176
#pragma pop_macro("UNPACK_VALUES_CASE")
72
176
}
73
74
template <typename OutType, int BIT_WIDTH>
75
std::pair<const uint8_t*, int64_t> BitPacking::UnpackValues(const uint8_t* __restrict__ in,
76
                                                            int64_t in_bytes, int64_t num_values,
77
22.8k
                                                            OutType* __restrict__ out) {
78
22.8k
    static_assert(IsSupportedUnpackingType<OutType>(), "Only unsigned integers are supported.");
79
80
22.8k
    constexpr int BATCH_SIZE = 32;
81
22.8k
    const int64_t values_to_read = NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
82
22.8k
    const int64_t batches_to_read = values_to_read / BATCH_SIZE;
83
22.8k
    const int64_t remainder_values = values_to_read % BATCH_SIZE;
84
22.8k
    const uint8_t* in_pos = in;
85
22.8k
    OutType* out_pos = out;
86
87
    // First unpack as many full batches as possible.
88
184k
    for (int64_t i = 0; i < batches_to_read; ++i) {
89
161k
        in_pos = Unpack32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos);
90
161k
        out_pos += BATCH_SIZE;
91
161k
        in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
92
161k
    }
93
94
    // Then unpack the final partial batch.
95
22.8k
    if (remainder_values > 0) {
96
11.4k
        in_pos = UnpackUpTo31Values<OutType, BIT_WIDTH>(in_pos, in_bytes, (int)remainder_values,
97
11.4k
                                                        out_pos);
98
11.4k
    }
99
22.8k
    return std::make_pair(in_pos, values_to_read);
100
22.8k
}
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi0EEESt4pairIPKhlES4_llPT_
_ZN5doris10BitPacking12UnpackValuesIjLi1EEESt4pairIPKhlES4_llPT_
Line
Count
Source
77
82
                                                            OutType* __restrict__ out) {
78
82
    static_assert(IsSupportedUnpackingType<OutType>(), "Only unsigned integers are supported.");
79
80
82
    constexpr int BATCH_SIZE = 32;
81
82
    const int64_t values_to_read = NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
82
82
    const int64_t batches_to_read = values_to_read / BATCH_SIZE;
83
82
    const int64_t remainder_values = values_to_read % BATCH_SIZE;
84
82
    const uint8_t* in_pos = in;
85
82
    OutType* out_pos = out;
86
87
    // First unpack as many full batches as possible.
88
90
    for (int64_t i = 0; i < batches_to_read; ++i) {
89
8
        in_pos = Unpack32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos);
90
8
        out_pos += BATCH_SIZE;
91
8
        in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
92
8
    }
93
94
    // Then unpack the final partial batch.
95
82
    if (remainder_values > 0) {
96
74
        in_pos = UnpackUpTo31Values<OutType, BIT_WIDTH>(in_pos, in_bytes, (int)remainder_values,
97
74
                                                        out_pos);
98
74
    }
99
82
    return std::make_pair(in_pos, values_to_read);
100
82
}
_ZN5doris10BitPacking12UnpackValuesIjLi2EEESt4pairIPKhlES4_llPT_
Line
Count
Source
77
134
                                                            OutType* __restrict__ out) {
78
134
    static_assert(IsSupportedUnpackingType<OutType>(), "Only unsigned integers are supported.");
79
80
134
    constexpr int BATCH_SIZE = 32;
81
134
    const int64_t values_to_read = NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
82
134
    const int64_t batches_to_read = values_to_read / BATCH_SIZE;
83
134
    const int64_t remainder_values = values_to_read % BATCH_SIZE;
84
134
    const uint8_t* in_pos = in;
85
134
    OutType* out_pos = out;
86
87
    // First unpack as many full batches as possible.
88
150
    for (int64_t i = 0; i < batches_to_read; ++i) {
89
16
        in_pos = Unpack32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos);
90
16
        out_pos += BATCH_SIZE;
91
16
        in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
92
16
    }
93
94
    // Then unpack the final partial batch.
95
134
    if (remainder_values > 0) {
96
124
        in_pos = UnpackUpTo31Values<OutType, BIT_WIDTH>(in_pos, in_bytes, (int)remainder_values,
97
124
                                                        out_pos);
98
124
    }
99
134
    return std::make_pair(in_pos, values_to_read);
100
134
}
_ZN5doris10BitPacking12UnpackValuesIjLi3EEESt4pairIPKhlES4_llPT_
Line
Count
Source
77
796
                                                            OutType* __restrict__ out) {
78
796
    static_assert(IsSupportedUnpackingType<OutType>(), "Only unsigned integers are supported.");
79
80
796
    constexpr int BATCH_SIZE = 32;
81
796
    const int64_t values_to_read = NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
82
796
    const int64_t batches_to_read = values_to_read / BATCH_SIZE;
83
796
    const int64_t remainder_values = values_to_read % BATCH_SIZE;
84
796
    const uint8_t* in_pos = in;
85
796
    OutType* out_pos = out;
86
87
    // First unpack as many full batches as possible.
88
4.80k
    for (int64_t i = 0; i < batches_to_read; ++i) {
89
4.00k
        in_pos = Unpack32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos);
90
4.00k
        out_pos += BATCH_SIZE;
91
4.00k
        in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
92
4.00k
    }
93
94
    // Then unpack the final partial batch.
95
796
    if (remainder_values > 0) {
96
465
        in_pos = UnpackUpTo31Values<OutType, BIT_WIDTH>(in_pos, in_bytes, (int)remainder_values,
97
465
                                                        out_pos);
98
465
    }
99
796
    return std::make_pair(in_pos, values_to_read);
100
796
}
_ZN5doris10BitPacking12UnpackValuesIjLi4EEESt4pairIPKhlES4_llPT_
Line
Count
Source
77
234
                                                            OutType* __restrict__ out) {
78
234
    static_assert(IsSupportedUnpackingType<OutType>(), "Only unsigned integers are supported.");
79
80
234
    constexpr int BATCH_SIZE = 32;
81
234
    const int64_t values_to_read = NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
82
234
    const int64_t batches_to_read = values_to_read / BATCH_SIZE;
83
234
    const int64_t remainder_values = values_to_read % BATCH_SIZE;
84
234
    const uint8_t* in_pos = in;
85
234
    OutType* out_pos = out;
86
87
    // First unpack as many full batches as possible.
88
612
    for (int64_t i = 0; i < batches_to_read; ++i) {
89
378
        in_pos = Unpack32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos);
90
378
        out_pos += BATCH_SIZE;
91
378
        in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
92
378
    }
93
94
    // Then unpack the final partial batch.
95
234
    if (remainder_values > 0) {
96
194
        in_pos = UnpackUpTo31Values<OutType, BIT_WIDTH>(in_pos, in_bytes, (int)remainder_values,
97
194
                                                        out_pos);
98
194
    }
99
234
    return std::make_pair(in_pos, values_to_read);
100
234
}
_ZN5doris10BitPacking12UnpackValuesIjLi5EEESt4pairIPKhlES4_llPT_
Line
Count
Source
77
538
                                                            OutType* __restrict__ out) {
78
538
    static_assert(IsSupportedUnpackingType<OutType>(), "Only unsigned integers are supported.");
79
80
538
    constexpr int BATCH_SIZE = 32;
81
538
    const int64_t values_to_read = NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
82
538
    const int64_t batches_to_read = values_to_read / BATCH_SIZE;
83
538
    const int64_t remainder_values = values_to_read % BATCH_SIZE;
84
538
    const uint8_t* in_pos = in;
85
538
    OutType* out_pos = out;
86
87
    // First unpack as many full batches as possible.
88
3.55k
    for (int64_t i = 0; i < batches_to_read; ++i) {
89
3.01k
        in_pos = Unpack32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos);
90
3.01k
        out_pos += BATCH_SIZE;
91
3.01k
        in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
92
3.01k
    }
93
94
    // Then unpack the final partial batch.
95
538
    if (remainder_values > 0) {
96
276
        in_pos = UnpackUpTo31Values<OutType, BIT_WIDTH>(in_pos, in_bytes, (int)remainder_values,
97
276
                                                        out_pos);
98
276
    }
99
538
    return std::make_pair(in_pos, values_to_read);
100
538
}
_ZN5doris10BitPacking12UnpackValuesIjLi6EEESt4pairIPKhlES4_llPT_
Line
Count
Source
77
156
                                                            OutType* __restrict__ out) {
78
156
    static_assert(IsSupportedUnpackingType<OutType>(), "Only unsigned integers are supported.");
79
80
156
    constexpr int BATCH_SIZE = 32;
81
156
    const int64_t values_to_read = NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
82
156
    const int64_t batches_to_read = values_to_read / BATCH_SIZE;
83
156
    const int64_t remainder_values = values_to_read % BATCH_SIZE;
84
156
    const uint8_t* in_pos = in;
85
156
    OutType* out_pos = out;
86
87
    // First unpack as many full batches as possible.
88
742
    for (int64_t i = 0; i < batches_to_read; ++i) {
89
586
        in_pos = Unpack32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos);
90
586
        out_pos += BATCH_SIZE;
91
586
        in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
92
586
    }
93
94
    // Then unpack the final partial batch.
95
156
    if (remainder_values > 0) {
96
76
        in_pos = UnpackUpTo31Values<OutType, BIT_WIDTH>(in_pos, in_bytes, (int)remainder_values,
97
76
                                                        out_pos);
98
76
    }
99
156
    return std::make_pair(in_pos, values_to_read);
100
156
}
_ZN5doris10BitPacking12UnpackValuesIjLi7EEESt4pairIPKhlES4_llPT_
Line
Count
Source
77
82
                                                            OutType* __restrict__ out) {
78
82
    static_assert(IsSupportedUnpackingType<OutType>(), "Only unsigned integers are supported.");
79
80
82
    constexpr int BATCH_SIZE = 32;
81
82
    const int64_t values_to_read = NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
82
82
    const int64_t batches_to_read = values_to_read / BATCH_SIZE;
83
82
    const int64_t remainder_values = values_to_read % BATCH_SIZE;
84
82
    const uint8_t* in_pos = in;
85
82
    OutType* out_pos = out;
86
87
    // First unpack as many full batches as possible.
88
205
    for (int64_t i = 0; i < batches_to_read; ++i) {
89
123
        in_pos = Unpack32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos);
90
123
        out_pos += BATCH_SIZE;
91
123
        in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
92
123
    }
93
94
    // Then unpack the final partial batch.
95
82
    if (remainder_values > 0) {
96
41
        in_pos = UnpackUpTo31Values<OutType, BIT_WIDTH>(in_pos, in_bytes, (int)remainder_values,
97
41
                                                        out_pos);
98
41
    }
99
82
    return std::make_pair(in_pos, values_to_read);
100
82
}
_ZN5doris10BitPacking12UnpackValuesIjLi8EEESt4pairIPKhlES4_llPT_
Line
Count
Source
77
2.55k
                                                            OutType* __restrict__ out) {
78
2.55k
    static_assert(IsSupportedUnpackingType<OutType>(), "Only unsigned integers are supported.");
79
80
2.55k
    constexpr int BATCH_SIZE = 32;
81
2.55k
    const int64_t values_to_read = NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
82
2.55k
    const int64_t batches_to_read = values_to_read / BATCH_SIZE;
83
2.55k
    const int64_t remainder_values = values_to_read % BATCH_SIZE;
84
2.55k
    const uint8_t* in_pos = in;
85
2.55k
    OutType* out_pos = out;
86
87
    // First unpack as many full batches as possible.
88
21.3k
    for (int64_t i = 0; i < batches_to_read; ++i) {
89
18.7k
        in_pos = Unpack32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos);
90
18.7k
        out_pos += BATCH_SIZE;
91
18.7k
        in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
92
18.7k
    }
93
94
    // Then unpack the final partial batch.
95
2.55k
    if (remainder_values > 0) {
96
1.25k
        in_pos = UnpackUpTo31Values<OutType, BIT_WIDTH>(in_pos, in_bytes, (int)remainder_values,
97
1.25k
                                                        out_pos);
98
1.25k
    }
99
2.55k
    return std::make_pair(in_pos, values_to_read);
100
2.55k
}
_ZN5doris10BitPacking12UnpackValuesIjLi9EEESt4pairIPKhlES4_llPT_
Line
Count
Source
77
668
                                                            OutType* __restrict__ out) {
78
668
    static_assert(IsSupportedUnpackingType<OutType>(), "Only unsigned integers are supported.");
79
80
668
    constexpr int BATCH_SIZE = 32;
81
668
    const int64_t values_to_read = NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
82
668
    const int64_t batches_to_read = values_to_read / BATCH_SIZE;
83
668
    const int64_t remainder_values = values_to_read % BATCH_SIZE;
84
668
    const uint8_t* in_pos = in;
85
668
    OutType* out_pos = out;
86
87
    // First unpack as many full batches as possible.
88
5.50k
    for (int64_t i = 0; i < batches_to_read; ++i) {
89
4.83k
        in_pos = Unpack32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos);
90
4.83k
        out_pos += BATCH_SIZE;
91
4.83k
        in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
92
4.83k
    }
93
94
    // Then unpack the final partial batch.
95
668
    if (remainder_values > 0) {
96
314
        in_pos = UnpackUpTo31Values<OutType, BIT_WIDTH>(in_pos, in_bytes, (int)remainder_values,
97
314
                                                        out_pos);
98
314
    }
99
668
    return std::make_pair(in_pos, values_to_read);
100
668
}
_ZN5doris10BitPacking12UnpackValuesIjLi10EEESt4pairIPKhlES4_llPT_
Line
Count
Source
77
1.79k
                                                            OutType* __restrict__ out) {
78
1.79k
    static_assert(IsSupportedUnpackingType<OutType>(), "Only unsigned integers are supported.");
79
80
1.79k
    constexpr int BATCH_SIZE = 32;
81
1.79k
    const int64_t values_to_read = NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
82
1.79k
    const int64_t batches_to_read = values_to_read / BATCH_SIZE;
83
1.79k
    const int64_t remainder_values = values_to_read % BATCH_SIZE;
84
1.79k
    const uint8_t* in_pos = in;
85
1.79k
    OutType* out_pos = out;
86
87
    // First unpack as many full batches as possible.
88
15.0k
    for (int64_t i = 0; i < batches_to_read; ++i) {
89
13.2k
        in_pos = Unpack32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos);
90
13.2k
        out_pos += BATCH_SIZE;
91
13.2k
        in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
92
13.2k
    }
93
94
    // Then unpack the final partial batch.
95
1.79k
    if (remainder_values > 0) {
96
884
        in_pos = UnpackUpTo31Values<OutType, BIT_WIDTH>(in_pos, in_bytes, (int)remainder_values,
97
884
                                                        out_pos);
98
884
    }
99
1.79k
    return std::make_pair(in_pos, values_to_read);
100
1.79k
}
_ZN5doris10BitPacking12UnpackValuesIjLi11EEESt4pairIPKhlES4_llPT_
Line
Count
Source
77
5.23k
                                                            OutType* __restrict__ out) {
78
5.23k
    static_assert(IsSupportedUnpackingType<OutType>(), "Only unsigned integers are supported.");
79
80
5.23k
    constexpr int BATCH_SIZE = 32;
81
5.23k
    const int64_t values_to_read = NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
82
5.23k
    const int64_t batches_to_read = values_to_read / BATCH_SIZE;
83
5.23k
    const int64_t remainder_values = values_to_read % BATCH_SIZE;
84
5.23k
    const uint8_t* in_pos = in;
85
5.23k
    OutType* out_pos = out;
86
87
    // First unpack as many full batches as possible.
88
43.4k
    for (int64_t i = 0; i < batches_to_read; ++i) {
89
38.1k
        in_pos = Unpack32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos);
90
38.1k
        out_pos += BATCH_SIZE;
91
38.1k
        in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
92
38.1k
    }
93
94
    // Then unpack the final partial batch.
95
5.23k
    if (remainder_values > 0) {
96
2.51k
        in_pos = UnpackUpTo31Values<OutType, BIT_WIDTH>(in_pos, in_bytes, (int)remainder_values,
97
2.51k
                                                        out_pos);
98
2.51k
    }
99
5.23k
    return std::make_pair(in_pos, values_to_read);
100
5.23k
}
_ZN5doris10BitPacking12UnpackValuesIjLi12EEESt4pairIPKhlES4_llPT_
Line
Count
Source
77
9.67k
                                                            OutType* __restrict__ out) {
78
9.67k
    static_assert(IsSupportedUnpackingType<OutType>(), "Only unsigned integers are supported.");
79
80
9.67k
    constexpr int BATCH_SIZE = 32;
81
9.67k
    const int64_t values_to_read = NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
82
9.67k
    const int64_t batches_to_read = values_to_read / BATCH_SIZE;
83
9.67k
    const int64_t remainder_values = values_to_read % BATCH_SIZE;
84
9.67k
    const uint8_t* in_pos = in;
85
9.67k
    OutType* out_pos = out;
86
87
    // First unpack as many full batches as possible.
88
81.4k
    for (int64_t i = 0; i < batches_to_read; ++i) {
89
71.7k
        in_pos = Unpack32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos);
90
71.7k
        out_pos += BATCH_SIZE;
91
71.7k
        in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
92
71.7k
    }
93
94
    // Then unpack the final partial batch.
95
9.67k
    if (remainder_values > 0) {
96
4.78k
        in_pos = UnpackUpTo31Values<OutType, BIT_WIDTH>(in_pos, in_bytes, (int)remainder_values,
97
4.78k
                                                        out_pos);
98
4.78k
    }
99
9.67k
    return std::make_pair(in_pos, values_to_read);
100
9.67k
}
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi13EEESt4pairIPKhlES4_llPT_
_ZN5doris10BitPacking12UnpackValuesIjLi14EEESt4pairIPKhlES4_llPT_
Line
Count
Source
77
200
                                                            OutType* __restrict__ out) {
78
200
    static_assert(IsSupportedUnpackingType<OutType>(), "Only unsigned integers are supported.");
79
80
200
    constexpr int BATCH_SIZE = 32;
81
200
    const int64_t values_to_read = NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
82
200
    const int64_t batches_to_read = values_to_read / BATCH_SIZE;
83
200
    const int64_t remainder_values = values_to_read % BATCH_SIZE;
84
200
    const uint8_t* in_pos = in;
85
200
    OutType* out_pos = out;
86
87
    // First unpack as many full batches as possible.
88
1.68k
    for (int64_t i = 0; i < batches_to_read; ++i) {
89
1.48k
        in_pos = Unpack32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos);
90
1.48k
        out_pos += BATCH_SIZE;
91
1.48k
        in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
92
1.48k
    }
93
94
    // Then unpack the final partial batch.
95
200
    if (remainder_values > 0) {
96
99
        in_pos = UnpackUpTo31Values<OutType, BIT_WIDTH>(in_pos, in_bytes, (int)remainder_values,
97
99
                                                        out_pos);
98
99
    }
99
200
    return std::make_pair(in_pos, values_to_read);
100
200
}
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi15EEESt4pairIPKhlES4_llPT_
_ZN5doris10BitPacking12UnpackValuesIjLi16EEESt4pairIPKhlES4_llPT_
Line
Count
Source
77
561
                                                            OutType* __restrict__ out) {
78
561
    static_assert(IsSupportedUnpackingType<OutType>(), "Only unsigned integers are supported.");
79
80
561
    constexpr int BATCH_SIZE = 32;
81
561
    const int64_t values_to_read = NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
82
561
    const int64_t batches_to_read = values_to_read / BATCH_SIZE;
83
561
    const int64_t remainder_values = values_to_read % BATCH_SIZE;
84
561
    const uint8_t* in_pos = in;
85
561
    OutType* out_pos = out;
86
87
    // First unpack as many full batches as possible.
88
4.74k
    for (int64_t i = 0; i < batches_to_read; ++i) {
89
4.18k
        in_pos = Unpack32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos);
90
4.18k
        out_pos += BATCH_SIZE;
91
4.18k
        in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
92
4.18k
    }
93
94
    // Then unpack the final partial batch.
95
561
    if (remainder_values > 0) {
96
280
        in_pos = UnpackUpTo31Values<OutType, BIT_WIDTH>(in_pos, in_bytes, (int)remainder_values,
97
280
                                                        out_pos);
98
280
    }
99
561
    return std::make_pair(in_pos, values_to_read);
100
561
}
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi17EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi18EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi19EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi20EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi21EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi22EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi23EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi24EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi25EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi26EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi27EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi28EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi29EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi30EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi31EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi32EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi33EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi34EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi35EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi36EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi37EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi38EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi39EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi40EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi41EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi42EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi43EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi44EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi45EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi46EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi47EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi48EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi49EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi50EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi51EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi52EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi53EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi54EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi55EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi56EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi57EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi58EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi59EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi60EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi61EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi62EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi63EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIjLi64EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi0EEESt4pairIPKhlES4_llPT_
_ZN5doris10BitPacking12UnpackValuesIhLi1EEESt4pairIPKhlES4_llPT_
Line
Count
Source
77
176
                                                            OutType* __restrict__ out) {
78
176
    static_assert(IsSupportedUnpackingType<OutType>(), "Only unsigned integers are supported.");
79
80
176
    constexpr int BATCH_SIZE = 32;
81
176
    const int64_t values_to_read = NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
82
176
    const int64_t batches_to_read = values_to_read / BATCH_SIZE;
83
176
    const int64_t remainder_values = values_to_read % BATCH_SIZE;
84
176
    const uint8_t* in_pos = in;
85
176
    OutType* out_pos = out;
86
87
    // First unpack as many full batches as possible.
88
739
    for (int64_t i = 0; i < batches_to_read; ++i) {
89
563
        in_pos = Unpack32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos);
90
563
        out_pos += BATCH_SIZE;
91
563
        in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
92
563
    }
93
94
    // Then unpack the final partial batch.
95
176
    if (remainder_values > 0) {
96
41
        in_pos = UnpackUpTo31Values<OutType, BIT_WIDTH>(in_pos, in_bytes, (int)remainder_values,
97
41
                                                        out_pos);
98
41
    }
99
176
    return std::make_pair(in_pos, values_to_read);
100
176
}
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi2EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi3EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi4EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi5EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi6EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi7EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi8EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi9EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi10EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi11EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi12EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi13EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi14EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi15EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi16EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi17EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi18EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi19EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi20EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi21EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi22EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi23EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi24EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi25EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi26EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi27EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi28EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi29EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi30EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi31EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi32EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi33EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi34EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi35EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi36EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi37EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi38EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi39EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi40EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi41EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi42EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi43EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi44EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi45EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi46EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi47EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi48EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi49EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi50EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi51EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi52EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi53EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi54EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi55EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi56EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi57EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi58EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi59EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi60EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi61EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi62EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi63EEESt4pairIPKhlES4_llPT_
Unexecuted instantiation: _ZN5doris10BitPacking12UnpackValuesIhLi64EEESt4pairIPKhlES4_llPT_
101
102
template <typename OutType>
103
std::pair<const uint8_t*, int64_t> BitPacking::UnpackAndDecodeValues(
104
        int bit_width, const uint8_t* __restrict__ in, int64_t in_bytes, OutType* __restrict__ dict,
105
        int64_t dict_len, int64_t num_values, OutType* __restrict__ out, int64_t stride,
106
        bool* __restrict__ decode_error) {
107
#pragma push_macro("UNPACK_VALUES_CASE")
108
#define UNPACK_VALUES_CASE(ignore1, i, ignore2)                                                 \
109
    case i:                                                                                     \
110
        return UnpackAndDecodeValues<OutType, i>(in, in_bytes, dict, dict_len, num_values, out, \
111
                                                 stride, decode_error);
112
113
    switch (bit_width) {
114
        // Expand cases from 0 to MAX_DICT_BITWIDTH.
115
        BOOST_PP_REPEAT_FROM_TO(0, 33, UNPACK_VALUES_CASE, ignore);
116
    default:
117
        DCHECK(false);
118
        return std::make_pair(nullptr, -1);
119
    }
120
#pragma pop_macro("UNPACK_VALUES_CASE")
121
}
122
template <typename OutType, int BIT_WIDTH>
123
std::pair<const uint8_t*, int64_t> BitPacking::UnpackAndDecodeValues(
124
        const uint8_t* __restrict__ in, int64_t in_bytes, OutType* __restrict__ dict,
125
        int64_t dict_len, int64_t num_values, OutType* __restrict__ out, int64_t stride,
126
        bool* __restrict__ decode_error) {
127
    constexpr int BATCH_SIZE = 32;
128
    const int64_t values_to_read = NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
129
    const int64_t batches_to_read = values_to_read / BATCH_SIZE;
130
    const int64_t remainder_values = values_to_read % BATCH_SIZE;
131
    const uint8_t* in_pos = in;
132
    uint8_t* out_pos = reinterpret_cast<uint8_t*>(out);
133
    // First unpack as many full batches as possible.
134
    for (int64_t i = 0; i < batches_to_read; ++i) {
135
        in_pos = UnpackAndDecode32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, dict, dict_len,
136
                                                             reinterpret_cast<OutType*>(out_pos),
137
                                                             stride, decode_error);
138
        out_pos += stride * BATCH_SIZE;
139
        in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
140
    }
141
    // Then unpack the final partial batch.
142
    if (remainder_values > 0) {
143
        in_pos = UnpackAndDecodeUpTo31Values<OutType, BIT_WIDTH>(
144
                in_pos, in_bytes, dict, dict_len, remainder_values,
145
                reinterpret_cast<OutType*>(out_pos), stride, decode_error);
146
    }
147
    return std::make_pair(in_pos, values_to_read);
148
}
149
150
// Loop body of unrolled loop that unpacks the value. BIT_WIDTH is the bit width of
151
// the packed values. 'in_buf' is the start of the input buffer and 'out_vals' is the
152
// start of the output values array. This function unpacks the VALUE_IDX'th packed value
153
// from 'in_buf'.
154
//
155
// This implements essentially the same algorithm as the (Apache-licensed) code in
156
// bpacking.c at https://github.com/lemire/FrameOfReference/, but is much more compact
157
// because it uses templates rather than source-level unrolling of all combinations.
158
//
159
// After the template parameters are expanded and constants are propagated, all branches
160
// and offset/shift calculations should be optimized out, leaving only shifts by constants
161
// and bitmasks by constants. Calls to this must be stamped out manually or with
162
// BOOST_PP_REPEAT_FROM_TO: experimentation revealed that the GCC 4.9.2 optimiser was
163
// not able to fully propagate constants and remove branches when this was called from
164
// inside a for loop with constant bounds with VALUE_IDX changed to a function argument.
165
//
166
// We compute how many 32 bit words we have to read, which is either 1, 2 or 3. If it is
167
// at least 2, the first two 32 bit words are read as one 64 bit word. Even if only one
168
// word needs to be read, we try to read 64 bits if it does not lead to buffer overflow
169
// because benchmarks show that it has a positive effect on performance.
170
//
171
// If 'FULL_BATCH' is true, this function call is part of unpacking 32 values, otherwise
172
// up to 31 values. This is needed to optimise the length of the reads (32 or 64 bits) and
173
// avoid buffer overflow (if we are unpacking 32 values, we can safely assume an input
174
// buffer of length 32 * BIT_WIDTH).
175
// Extracts the VALUE_IDX'th BIT_WIDTH-bit value from 'in_buf' and returns it in the low
// bits of a uint64_t. All index/offset arithmetic below is constexpr, so each
// instantiation reduces to a load, a shift and a mask.
template <int BIT_WIDTH, int VALUE_IDX, bool FULL_BATCH>
176
5.42M
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
5.42M
    // Zero-width values occupy no input bits; the result is always 0.
    if (BIT_WIDTH == 0) return 0;
178
179
5.42M
    // Bit position (from the start of 'in_buf') of the first bit of this value.
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
5.42M
    // Index of the 32-bit word that contains that first bit.
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
5.42M
    // Bit position one past the last bit of this value.
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
5.42M
    // NOTE(review): presumably the number of 32-bit words needed to cover LAST_BIT_IDX
    // bits, i.e. one past the index of the last word touched - confirm against BitUtil.
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
5.42M
    // How many consecutive 32-bit words the value spans.
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
5.42M
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
5.42M
    // Position of the value's first bit inside its first word (remainder of the
    // division by 32 above).
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
5.42M
    // Keeps only the low BIT_WIDTH bits of the shifted word.
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
5.42M
    // View the input as 32-bit words; all indexing below is in units of uint32_t.
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
5.42M
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
5.42M
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
5.42M
    constexpr bool READ_32_BITS =
203
5.42M
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
5.42M
    // Single-word path: the whole value lives inside in[FIRST_WORD_IDX].
    if (READ_32_BITS) {
206
1.31M
        uint32_t word = in[FIRST_WORD_IDX];
207
1.31M
        // FIRST_BIT_OFFSET is a remainder modulo 32, so the guard always selects it;
        // presumably the ternary only keeps the shift count provably below 32 for the
        // compiler - TODO confirm.
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
1.31M
        return word & mask;
209
1.31M
    }
210
211
4.11M
    // Wide path: load two adjacent 32-bit words as one 64-bit word.
    // NOTE(review): this dereferences a uint64_t pointer derived from a uint32_t pointer;
    // nothing visible here establishes 8-byte alignment of the address, which is
    // presumably why the function is marked NO_SANITIZE_UNDEFINED - confirm.
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.11M
    word >>= FIRST_BIT_OFFSET;
213
214
4.11M
    // The value spills into a third word: OR its bits in above the bits that remained
    // from the 64-bit load after the shift.
    if (WORDS_TO_READ > 2) {
215
0
        // Number of bits left in 'word' after the shift. The '== 0' arm avoids forming a
        // shift by 64 in the expression below when FIRST_BIT_OFFSET is 0.
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.11M
    return word & mask;
221
5.42M
}
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi0ELi0ELb0EEEmPKh
_ZN5doris11UnpackValueILi1ELi0ELb1EEEmPKh
Line
Count
Source
176
571
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
571
    if (BIT_WIDTH == 0) return 0;
178
179
571
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
571
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
571
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
571
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
571
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
571
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
571
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
571
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
571
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
571
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
571
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
571
    constexpr bool READ_32_BITS =
203
571
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
571
    if (READ_32_BITS) {
206
571
        uint32_t word = in[FIRST_WORD_IDX];
207
571
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
571
        return word & mask;
209
571
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
571
}
_ZN5doris11UnpackValueILi1ELi1ELb1EEEmPKh
Line
Count
Source
176
571
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
571
    if (BIT_WIDTH == 0) return 0;
178
179
571
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
571
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
571
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
571
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
571
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
571
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
571
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
571
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
571
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
571
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
571
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
571
    constexpr bool READ_32_BITS =
203
571
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
571
    if (READ_32_BITS) {
206
571
        uint32_t word = in[FIRST_WORD_IDX];
207
571
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
571
        return word & mask;
209
571
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
571
}
_ZN5doris11UnpackValueILi1ELi2ELb1EEEmPKh
Line
Count
Source
176
571
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
571
    if (BIT_WIDTH == 0) return 0;
178
179
571
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
571
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
571
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
571
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
571
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
571
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
571
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
571
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
571
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
571
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
571
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
571
    constexpr bool READ_32_BITS =
203
571
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
571
    if (READ_32_BITS) {
206
571
        uint32_t word = in[FIRST_WORD_IDX];
207
571
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
571
        return word & mask;
209
571
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
571
}
_ZN5doris11UnpackValueILi1ELi3ELb1EEEmPKh
Line
Count
Source
176
571
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
571
    if (BIT_WIDTH == 0) return 0;
178
179
571
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
571
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
571
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
571
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
571
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
571
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
571
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
571
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
571
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
571
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
571
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
571
    constexpr bool READ_32_BITS =
203
571
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
571
    if (READ_32_BITS) {
206
571
        uint32_t word = in[FIRST_WORD_IDX];
207
571
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
571
        return word & mask;
209
571
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
571
}
_ZN5doris11UnpackValueILi1ELi4ELb1EEEmPKh
Line
Count
Source
176
571
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
571
    if (BIT_WIDTH == 0) return 0;
178
179
571
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
571
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
571
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
571
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
571
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
571
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
571
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
571
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
571
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
571
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
571
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
571
    constexpr bool READ_32_BITS =
203
571
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
571
    if (READ_32_BITS) {
206
571
        uint32_t word = in[FIRST_WORD_IDX];
207
571
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
571
        return word & mask;
209
571
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
571
}
_ZN5doris11UnpackValueILi1ELi5ELb1EEEmPKh
Line
Count
Source
176
571
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
571
    if (BIT_WIDTH == 0) return 0;
178
179
571
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
571
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
571
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
571
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
571
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
571
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
571
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
571
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
571
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
571
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
571
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
571
    constexpr bool READ_32_BITS =
203
571
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
571
    if (READ_32_BITS) {
206
571
        uint32_t word = in[FIRST_WORD_IDX];
207
571
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
571
        return word & mask;
209
571
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
571
}
_ZN5doris11UnpackValueILi1ELi6ELb1EEEmPKh
Line
Count
Source
176
571
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
571
    if (BIT_WIDTH == 0) return 0;
178
179
571
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
571
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
571
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
571
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
571
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
571
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
571
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
571
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
571
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
571
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
571
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
571
    constexpr bool READ_32_BITS =
203
571
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
571
    if (READ_32_BITS) {
206
571
        uint32_t word = in[FIRST_WORD_IDX];
207
571
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
571
        return word & mask;
209
571
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
571
}
_ZN5doris11UnpackValueILi1ELi7ELb1EEEmPKh
Line
Count
Source
176
571
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
571
    if (BIT_WIDTH == 0) return 0;
178
179
571
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
571
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
571
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
571
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
571
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
571
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
571
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
571
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
571
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
571
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
571
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
571
    constexpr bool READ_32_BITS =
203
571
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
571
    if (READ_32_BITS) {
206
571
        uint32_t word = in[FIRST_WORD_IDX];
207
571
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
571
        return word & mask;
209
571
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
571
}
_ZN5doris11UnpackValueILi1ELi8ELb1EEEmPKh
Line
Count
Source
176
571
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
571
    if (BIT_WIDTH == 0) return 0;
178
179
571
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
571
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
571
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
571
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
571
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
571
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
571
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
571
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
571
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
571
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
571
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
571
    constexpr bool READ_32_BITS =
203
571
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
571
    if (READ_32_BITS) {
206
571
        uint32_t word = in[FIRST_WORD_IDX];
207
571
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
571
        return word & mask;
209
571
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
571
}
_ZN5doris11UnpackValueILi1ELi9ELb1EEEmPKh
Line
Count
Source
176
571
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
571
    if (BIT_WIDTH == 0) return 0;
178
179
571
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
571
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
571
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
571
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
571
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
571
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
571
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
571
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
571
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
571
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
571
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
571
    constexpr bool READ_32_BITS =
203
571
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
571
    if (READ_32_BITS) {
206
571
        uint32_t word = in[FIRST_WORD_IDX];
207
571
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
571
        return word & mask;
209
571
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
571
}
_ZN5doris11UnpackValueILi1ELi10ELb1EEEmPKh
Line
Count
Source
176
571
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
571
    if (BIT_WIDTH == 0) return 0;
178
179
571
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
571
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
571
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
571
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
571
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
571
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
571
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
571
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
571
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
571
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
571
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
571
    constexpr bool READ_32_BITS =
203
571
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
571
    if (READ_32_BITS) {
206
571
        uint32_t word = in[FIRST_WORD_IDX];
207
571
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
571
        return word & mask;
209
571
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
571
}
_ZN5doris11UnpackValueILi1ELi11ELb1EEEmPKh
Line
Count
Source
176
571
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
571
    if (BIT_WIDTH == 0) return 0;
178
179
571
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
571
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
571
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
571
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
571
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
571
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
571
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
571
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
571
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
571
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
571
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
571
    constexpr bool READ_32_BITS =
203
571
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
571
    if (READ_32_BITS) {
206
571
        uint32_t word = in[FIRST_WORD_IDX];
207
571
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
571
        return word & mask;
209
571
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
571
}
_ZN5doris11UnpackValueILi1ELi12ELb1EEEmPKh
Line
Count
Source
176
571
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
571
    if (BIT_WIDTH == 0) return 0;
178
179
571
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
571
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
571
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
571
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
571
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
571
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
571
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
571
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
571
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
571
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
571
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
571
    constexpr bool READ_32_BITS =
203
571
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
571
    if (READ_32_BITS) {
206
571
        uint32_t word = in[FIRST_WORD_IDX];
207
571
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
571
        return word & mask;
209
571
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
571
}
_ZN5doris11UnpackValueILi1ELi13ELb1EEEmPKh
Line
Count
Source
176
571
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
571
    if (BIT_WIDTH == 0) return 0;
178
179
571
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
571
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
571
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
571
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
571
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
571
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
571
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
571
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
571
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
571
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
571
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
571
    constexpr bool READ_32_BITS =
203
571
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
571
    if (READ_32_BITS) {
206
571
        uint32_t word = in[FIRST_WORD_IDX];
207
571
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
571
        return word & mask;
209
571
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
571
}
_ZN5doris11UnpackValueILi1ELi14ELb1EEEmPKh
Line
Count
Source
176
571
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
571
    if (BIT_WIDTH == 0) return 0;
178
179
571
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
571
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
571
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
571
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
571
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
571
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
571
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
571
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
571
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
571
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
571
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
571
    constexpr bool READ_32_BITS =
203
571
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
571
    if (READ_32_BITS) {
206
571
        uint32_t word = in[FIRST_WORD_IDX];
207
571
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
571
        return word & mask;
209
571
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
571
}
_ZN5doris11UnpackValueILi1ELi15ELb1EEEmPKh
Line
Count
Source
176
571
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
571
    if (BIT_WIDTH == 0) return 0;
178
179
571
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
571
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
571
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
571
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
571
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
571
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
571
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
571
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
571
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
571
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
571
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
571
    constexpr bool READ_32_BITS =
203
571
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
571
    if (READ_32_BITS) {
206
571
        uint32_t word = in[FIRST_WORD_IDX];
207
571
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
571
        return word & mask;
209
571
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
571
}
_ZN5doris11UnpackValueILi1ELi16ELb1EEEmPKh
Line
Count
Source
176
571
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
571
    if (BIT_WIDTH == 0) return 0;
178
179
571
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
571
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
571
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
571
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
571
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
571
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
571
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
571
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
571
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
571
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
571
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
571
    constexpr bool READ_32_BITS =
203
571
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
571
    if (READ_32_BITS) {
206
571
        uint32_t word = in[FIRST_WORD_IDX];
207
571
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
571
        return word & mask;
209
571
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
571
}
_ZN5doris11UnpackValueILi1ELi17ELb1EEEmPKh
Line
Count
Source
176
571
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
571
    if (BIT_WIDTH == 0) return 0;
178
179
571
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
571
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
571
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
571
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
571
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
571
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
571
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
571
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
571
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
571
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
571
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
571
    constexpr bool READ_32_BITS =
203
571
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
571
    if (READ_32_BITS) {
206
571
        uint32_t word = in[FIRST_WORD_IDX];
207
571
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
571
        return word & mask;
209
571
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
571
}
_ZN5doris11UnpackValueILi1ELi18ELb1EEEmPKh
Line
Count
Source
176
571
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
571
    if (BIT_WIDTH == 0) return 0;
178
179
571
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
571
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
571
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
571
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
571
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
571
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
571
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
571
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
571
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
571
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
571
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
571
    constexpr bool READ_32_BITS =
203
571
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
571
    if (READ_32_BITS) {
206
571
        uint32_t word = in[FIRST_WORD_IDX];
207
571
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
571
        return word & mask;
209
571
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
571
}
_ZN5doris11UnpackValueILi1ELi19ELb1EEEmPKh
Line
Count
Source
176
571
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
571
    if (BIT_WIDTH == 0) return 0;
178
179
571
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
571
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
571
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
571
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
571
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
571
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
571
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
571
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
571
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
571
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
571
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
571
    constexpr bool READ_32_BITS =
203
571
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
571
    if (READ_32_BITS) {
206
571
        uint32_t word = in[FIRST_WORD_IDX];
207
571
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
571
        return word & mask;
209
571
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
571
}
_ZN5doris11UnpackValueILi1ELi20ELb1EEEmPKh
Line
Count
Source
176
571
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
571
    if (BIT_WIDTH == 0) return 0;
178
179
571
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
571
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
571
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
571
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
571
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
571
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
571
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
571
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
571
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
571
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
571
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
571
    constexpr bool READ_32_BITS =
203
571
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
571
    if (READ_32_BITS) {
206
571
        uint32_t word = in[FIRST_WORD_IDX];
207
571
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
571
        return word & mask;
209
571
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
571
}
_ZN5doris11UnpackValueILi1ELi21ELb1EEEmPKh
Line
Count
Source
176
571
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
571
    if (BIT_WIDTH == 0) return 0;
178
179
571
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
571
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
571
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
571
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
571
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
571
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
571
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
571
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
571
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
571
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
571
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
571
    constexpr bool READ_32_BITS =
203
571
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
571
    if (READ_32_BITS) {
206
571
        uint32_t word = in[FIRST_WORD_IDX];
207
571
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
571
        return word & mask;
209
571
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
571
}
_ZN5doris11UnpackValueILi1ELi22ELb1EEEmPKh
Line
Count
Source
176
571
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
571
    if (BIT_WIDTH == 0) return 0;
178
179
571
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
571
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
571
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
571
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
571
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
571
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
571
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
571
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
571
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
571
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
571
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
571
    constexpr bool READ_32_BITS =
203
571
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
571
    if (READ_32_BITS) {
206
571
        uint32_t word = in[FIRST_WORD_IDX];
207
571
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
571
        return word & mask;
209
571
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
571
}
_ZN5doris11UnpackValueILi1ELi23ELb1EEEmPKh
Line
Count
Source
176
571
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
571
    if (BIT_WIDTH == 0) return 0;
178
179
571
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
571
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
571
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
571
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
571
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
571
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
571
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
571
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
571
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
571
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
571
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
571
    constexpr bool READ_32_BITS =
203
571
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
571
    if (READ_32_BITS) {
206
571
        uint32_t word = in[FIRST_WORD_IDX];
207
571
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
571
        return word & mask;
209
571
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
571
}
_ZN5doris11UnpackValueILi1ELi24ELb1EEEmPKh
Line
Count
Source
176
571
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
571
    if (BIT_WIDTH == 0) return 0;
178
179
571
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
571
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
571
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
571
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
571
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
571
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
571
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
571
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
571
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
571
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
571
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
571
    constexpr bool READ_32_BITS =
203
571
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
571
    if (READ_32_BITS) {
206
571
        uint32_t word = in[FIRST_WORD_IDX];
207
571
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
571
        return word & mask;
209
571
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
571
}
_ZN5doris11UnpackValueILi1ELi25ELb1EEEmPKh
Line
Count
Source
176
571
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
571
    if (BIT_WIDTH == 0) return 0;
178
179
571
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
571
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
571
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
571
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
571
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
571
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
571
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
571
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
571
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
571
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
571
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
571
    constexpr bool READ_32_BITS =
203
571
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
571
    if (READ_32_BITS) {
206
571
        uint32_t word = in[FIRST_WORD_IDX];
207
571
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
571
        return word & mask;
209
571
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
571
}
_ZN5doris11UnpackValueILi1ELi26ELb1EEEmPKh
Line
Count
Source
176
571
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
571
    if (BIT_WIDTH == 0) return 0;
178
179
571
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
571
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
571
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
571
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
571
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
571
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
571
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
571
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
571
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
571
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
571
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
571
    constexpr bool READ_32_BITS =
203
571
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
571
    if (READ_32_BITS) {
206
571
        uint32_t word = in[FIRST_WORD_IDX];
207
571
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
571
        return word & mask;
209
571
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
571
}
_ZN5doris11UnpackValueILi1ELi27ELb1EEEmPKh
Line
Count
Source
176
571
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
571
    if (BIT_WIDTH == 0) return 0;
178
179
571
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
571
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
571
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
571
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
571
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
571
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
571
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
571
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
571
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
571
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
571
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
571
    constexpr bool READ_32_BITS =
203
571
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
571
    if (READ_32_BITS) {
206
571
        uint32_t word = in[FIRST_WORD_IDX];
207
571
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
571
        return word & mask;
209
571
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
571
}
_ZN5doris11UnpackValueILi1ELi28ELb1EEEmPKh
Line
Count
Source
176
571
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
571
    if (BIT_WIDTH == 0) return 0;
178
179
571
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
571
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
571
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
571
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
571
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
571
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
571
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
571
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
571
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
571
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
571
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
571
    constexpr bool READ_32_BITS =
203
571
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
571
    if (READ_32_BITS) {
206
571
        uint32_t word = in[FIRST_WORD_IDX];
207
571
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
571
        return word & mask;
209
571
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
571
}
_ZN5doris11UnpackValueILi1ELi29ELb1EEEmPKh
Line
Count
Source
176
571
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
571
    if (BIT_WIDTH == 0) return 0;
178
179
571
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
571
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
571
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
571
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
571
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
571
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
571
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
571
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
571
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
571
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
571
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
571
    constexpr bool READ_32_BITS =
203
571
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
571
    if (READ_32_BITS) {
206
571
        uint32_t word = in[FIRST_WORD_IDX];
207
571
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
571
        return word & mask;
209
571
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
571
}
_ZN5doris11UnpackValueILi1ELi30ELb1EEEmPKh
Line
Count
Source
176
571
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
571
    if (BIT_WIDTH == 0) return 0;
178
179
571
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
571
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
571
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
571
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
571
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
571
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
571
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
571
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
571
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
571
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
571
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
571
    constexpr bool READ_32_BITS =
203
571
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
571
    if (READ_32_BITS) {
206
571
        uint32_t word = in[FIRST_WORD_IDX];
207
571
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
571
        return word & mask;
209
571
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
571
}
_ZN5doris11UnpackValueILi1ELi31ELb1EEEmPKh
Line
Count
Source
176
571
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
571
    if (BIT_WIDTH == 0) return 0;
178
179
571
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
571
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
571
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
571
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
571
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
571
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
571
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
571
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
571
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
571
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
571
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
571
    constexpr bool READ_32_BITS =
203
571
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
571
    if (READ_32_BITS) {
206
571
        uint32_t word = in[FIRST_WORD_IDX];
207
571
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
571
        return word & mask;
209
571
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
571
}
Unexecuted instantiation: _ZN5doris11UnpackValueILi1ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi1ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi1ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi1ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi1ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi1ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi1ELi24ELb0EEEmPKh
_ZN5doris11UnpackValueILi1ELi23ELb0EEEmPKh
Line
Count
Source
176
23
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
23
    if (BIT_WIDTH == 0) return 0;
178
179
23
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
23
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
23
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
23
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
23
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
23
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
23
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
23
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
23
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
23
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
23
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
23
    constexpr bool READ_32_BITS =
203
23
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
23
    if (READ_32_BITS) {
206
23
        uint32_t word = in[FIRST_WORD_IDX];
207
23
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
23
        return word & mask;
209
23
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
23
}
_ZN5doris11UnpackValueILi1ELi22ELb0EEEmPKh
Line
Count
Source
176
23
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
23
    if (BIT_WIDTH == 0) return 0;
178
179
23
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
23
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
23
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
23
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
23
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
23
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
23
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
23
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
23
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
23
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
23
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
23
    constexpr bool READ_32_BITS =
203
23
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
23
    if (READ_32_BITS) {
206
23
        uint32_t word = in[FIRST_WORD_IDX];
207
23
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
23
        return word & mask;
209
23
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
23
}
_ZN5doris11UnpackValueILi1ELi21ELb0EEEmPKh
Line
Count
Source
176
23
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
23
    if (BIT_WIDTH == 0) return 0;
178
179
23
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
23
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
23
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
23
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
23
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
23
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
23
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
23
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
23
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
23
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
23
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
23
    constexpr bool READ_32_BITS =
203
23
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
23
    if (READ_32_BITS) {
206
23
        uint32_t word = in[FIRST_WORD_IDX];
207
23
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
23
        return word & mask;
209
23
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
23
}
_ZN5doris11UnpackValueILi1ELi20ELb0EEEmPKh
Line
Count
Source
176
23
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
23
    if (BIT_WIDTH == 0) return 0;
178
179
23
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
23
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
23
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
23
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
23
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
23
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
23
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
23
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
23
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
23
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
23
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
23
    constexpr bool READ_32_BITS =
203
23
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
23
    if (READ_32_BITS) {
206
23
        uint32_t word = in[FIRST_WORD_IDX];
207
23
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
23
        return word & mask;
209
23
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
23
}
_ZN5doris11UnpackValueILi1ELi19ELb0EEEmPKh
Line
Count
Source
176
23
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
23
    if (BIT_WIDTH == 0) return 0;
178
179
23
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
23
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
23
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
23
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
23
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
23
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
23
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
23
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
23
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
23
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
23
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
23
    constexpr bool READ_32_BITS =
203
23
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
23
    if (READ_32_BITS) {
206
23
        uint32_t word = in[FIRST_WORD_IDX];
207
23
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
23
        return word & mask;
209
23
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
23
}
_ZN5doris11UnpackValueILi1ELi18ELb0EEEmPKh
Line
Count
Source
176
23
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
23
    if (BIT_WIDTH == 0) return 0;
178
179
23
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
23
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
23
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
23
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
23
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
23
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
23
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
23
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
23
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
23
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
23
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
23
    constexpr bool READ_32_BITS =
203
23
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
23
    if (READ_32_BITS) {
206
23
        uint32_t word = in[FIRST_WORD_IDX];
207
23
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
23
        return word & mask;
209
23
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
23
}
_ZN5doris11UnpackValueILi1ELi17ELb0EEEmPKh
Line
Count
Source
176
23
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
23
    if (BIT_WIDTH == 0) return 0;
178
179
23
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
23
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
23
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
23
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
23
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
23
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
23
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
23
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
23
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
23
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
23
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
23
    constexpr bool READ_32_BITS =
203
23
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
23
    if (READ_32_BITS) {
206
23
        uint32_t word = in[FIRST_WORD_IDX];
207
23
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
23
        return word & mask;
209
23
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
23
}
_ZN5doris11UnpackValueILi1ELi16ELb0EEEmPKh
Line
Count
Source
176
23
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
23
    if (BIT_WIDTH == 0) return 0;
178
179
23
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
23
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
23
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
23
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
23
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
23
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
23
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
23
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
23
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
23
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
23
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
23
    constexpr bool READ_32_BITS =
203
23
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
23
    if (READ_32_BITS) {
206
23
        uint32_t word = in[FIRST_WORD_IDX];
207
23
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
23
        return word & mask;
209
23
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
23
}
_ZN5doris11UnpackValueILi1ELi15ELb0EEEmPKh
Line
Count
Source
176
55
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
55
    if (BIT_WIDTH == 0) return 0;
178
179
55
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
55
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
55
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
55
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
55
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
55
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
55
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
55
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
55
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
55
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
55
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
55
    constexpr bool READ_32_BITS =
203
55
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
55
    if (READ_32_BITS) {
206
55
        uint32_t word = in[FIRST_WORD_IDX];
207
55
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
55
        return word & mask;
209
55
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
55
}
_ZN5doris11UnpackValueILi1ELi14ELb0EEEmPKh
Line
Count
Source
176
55
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
55
    if (BIT_WIDTH == 0) return 0;
178
179
55
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
55
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
55
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
55
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
55
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
55
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
55
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
55
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
55
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
55
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
55
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
55
    constexpr bool READ_32_BITS =
203
55
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
55
    if (READ_32_BITS) {
206
55
        uint32_t word = in[FIRST_WORD_IDX];
207
55
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
55
        return word & mask;
209
55
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
55
}
_ZN5doris11UnpackValueILi1ELi13ELb0EEEmPKh
Line
Count
Source
176
55
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
55
    if (BIT_WIDTH == 0) return 0;
178
179
55
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
55
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
55
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
55
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
55
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
55
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
55
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
55
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
55
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
55
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
55
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
55
    constexpr bool READ_32_BITS =
203
55
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
55
    if (READ_32_BITS) {
206
55
        uint32_t word = in[FIRST_WORD_IDX];
207
55
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
55
        return word & mask;
209
55
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
55
}
_ZN5doris11UnpackValueILi1ELi12ELb0EEEmPKh
Line
Count
Source
176
55
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
55
    if (BIT_WIDTH == 0) return 0;
178
179
55
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
55
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
55
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
55
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
55
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
55
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
55
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
55
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
55
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
55
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
55
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
55
    constexpr bool READ_32_BITS =
203
55
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
55
    if (READ_32_BITS) {
206
55
        uint32_t word = in[FIRST_WORD_IDX];
207
55
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
55
        return word & mask;
209
55
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
55
}
_ZN5doris11UnpackValueILi1ELi11ELb0EEEmPKh
Line
Count
Source
176
55
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
55
    if (BIT_WIDTH == 0) return 0;
178
179
55
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
55
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
55
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
55
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
55
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
55
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
55
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
55
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
55
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
55
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
55
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
55
    constexpr bool READ_32_BITS =
203
55
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
55
    if (READ_32_BITS) {
206
55
        uint32_t word = in[FIRST_WORD_IDX];
207
55
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
55
        return word & mask;
209
55
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
55
}
_ZN5doris11UnpackValueILi1ELi10ELb0EEEmPKh
Line
Count
Source
176
55
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
55
    if (BIT_WIDTH == 0) return 0;
178
179
55
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
55
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
55
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
55
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
55
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
55
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
55
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
55
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
55
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
55
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
55
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
55
    constexpr bool READ_32_BITS =
203
55
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
55
    if (READ_32_BITS) {
206
55
        uint32_t word = in[FIRST_WORD_IDX];
207
55
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
55
        return word & mask;
209
55
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
55
}
_ZN5doris11UnpackValueILi1ELi9ELb0EEEmPKh
Line
Count
Source
176
55
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
55
    if (BIT_WIDTH == 0) return 0;
178
179
55
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
55
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
55
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
55
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
55
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
55
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
55
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
55
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
55
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
55
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
55
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
55
    constexpr bool READ_32_BITS =
203
55
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
55
    if (READ_32_BITS) {
206
55
        uint32_t word = in[FIRST_WORD_IDX];
207
55
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
55
        return word & mask;
209
55
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
55
}
_ZN5doris11UnpackValueILi1ELi8ELb0EEEmPKh
Line
Count
Source
176
55
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
55
    if (BIT_WIDTH == 0) return 0;
178
179
55
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
55
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
55
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
55
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
55
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
55
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
55
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
55
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
55
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
55
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
55
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
55
    constexpr bool READ_32_BITS =
203
55
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
55
    if (READ_32_BITS) {
206
55
        uint32_t word = in[FIRST_WORD_IDX];
207
55
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
55
        return word & mask;
209
55
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
55
}
_ZN5doris11UnpackValueILi1ELi7ELb0EEEmPKh
Line
Count
Source
176
115
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
115
    if (BIT_WIDTH == 0) return 0;
178
179
115
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
115
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
115
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
115
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
115
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
115
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
115
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
115
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
115
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
115
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
115
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
115
    constexpr bool READ_32_BITS =
203
115
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
115
    if (READ_32_BITS) {
206
115
        uint32_t word = in[FIRST_WORD_IDX];
207
115
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
115
        return word & mask;
209
115
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
115
}
_ZN5doris11UnpackValueILi1ELi6ELb0EEEmPKh
Line
Count
Source
176
115
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
115
    if (BIT_WIDTH == 0) return 0;
178
179
115
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
115
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
115
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
115
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
115
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
115
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
115
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
115
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
115
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
115
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
115
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
115
    constexpr bool READ_32_BITS =
203
115
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
115
    if (READ_32_BITS) {
206
115
        uint32_t word = in[FIRST_WORD_IDX];
207
115
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
115
        return word & mask;
209
115
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
115
}
_ZN5doris11UnpackValueILi1ELi5ELb0EEEmPKh
Line
Count
Source
176
115
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
115
    if (BIT_WIDTH == 0) return 0;
178
179
115
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
115
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
115
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
115
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
115
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
115
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
115
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
115
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
115
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
115
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
115
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
115
    constexpr bool READ_32_BITS =
203
115
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
115
    if (READ_32_BITS) {
206
115
        uint32_t word = in[FIRST_WORD_IDX];
207
115
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
115
        return word & mask;
209
115
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
115
}
_ZN5doris11UnpackValueILi1ELi4ELb0EEEmPKh
Line
Count
Source
176
115
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
115
    if (BIT_WIDTH == 0) return 0;
178
179
115
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
115
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
115
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
115
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
115
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
115
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
115
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
115
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
115
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
115
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
115
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
115
    constexpr bool READ_32_BITS =
203
115
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
115
    if (READ_32_BITS) {
206
115
        uint32_t word = in[FIRST_WORD_IDX];
207
115
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
115
        return word & mask;
209
115
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
115
}
_ZN5doris11UnpackValueILi1ELi3ELb0EEEmPKh
Line
Count
Source
176
115
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
115
    if (BIT_WIDTH == 0) return 0;
178
179
115
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
115
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
115
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
115
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
115
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
115
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
115
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
115
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
115
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
115
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
115
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
115
    constexpr bool READ_32_BITS =
203
115
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
115
    if (READ_32_BITS) {
206
115
        uint32_t word = in[FIRST_WORD_IDX];
207
115
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
115
        return word & mask;
209
115
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
115
}
_ZN5doris11UnpackValueILi1ELi2ELb0EEEmPKh
Line
Count
Source
176
115
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
115
    if (BIT_WIDTH == 0) return 0;
178
179
115
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
115
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
115
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
115
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
115
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
115
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
115
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
115
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
115
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
115
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
115
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
115
    constexpr bool READ_32_BITS =
203
115
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
115
    if (READ_32_BITS) {
206
115
        uint32_t word = in[FIRST_WORD_IDX];
207
115
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
115
        return word & mask;
209
115
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
115
}
_ZN5doris11UnpackValueILi1ELi1ELb0EEEmPKh
Line
Count
Source
176
115
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
115
    if (BIT_WIDTH == 0) return 0;
178
179
115
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
115
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
115
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
115
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
115
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
115
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
115
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
115
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
115
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
115
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
115
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
115
    constexpr bool READ_32_BITS =
203
115
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
115
    if (READ_32_BITS) {
206
115
        uint32_t word = in[FIRST_WORD_IDX];
207
115
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
115
        return word & mask;
209
115
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
115
}
_ZN5doris11UnpackValueILi1ELi0ELb0EEEmPKh
Line
Count
Source
176
115
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
115
    if (BIT_WIDTH == 0) return 0;
178
179
115
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
115
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
115
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
115
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
115
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
115
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
115
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
115
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
115
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
115
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
115
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
115
    constexpr bool READ_32_BITS =
203
115
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
115
    if (READ_32_BITS) {
206
115
        uint32_t word = in[FIRST_WORD_IDX];
207
115
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
115
        return word & mask;
209
115
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
115
}
_ZN5doris11UnpackValueILi2ELi0ELb1EEEmPKh
Line
Count
Source
176
16
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
16
    if (BIT_WIDTH == 0) return 0;
178
179
16
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
16
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
16
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
16
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
16
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
16
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
16
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
16
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
16
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
16
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
16
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
16
    constexpr bool READ_32_BITS =
203
16
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
16
    if (READ_32_BITS) {
206
16
        uint32_t word = in[FIRST_WORD_IDX];
207
16
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
16
        return word & mask;
209
16
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
16
}
_ZN5doris11UnpackValueILi2ELi1ELb1EEEmPKh
Line
Count
Source
176
16
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
16
    if (BIT_WIDTH == 0) return 0;
178
179
16
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
16
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
16
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
16
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
16
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
16
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
16
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
16
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
16
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
16
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
16
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
16
    constexpr bool READ_32_BITS =
203
16
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
16
    if (READ_32_BITS) {
206
16
        uint32_t word = in[FIRST_WORD_IDX];
207
16
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
16
        return word & mask;
209
16
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
16
}
_ZN5doris11UnpackValueILi2ELi2ELb1EEEmPKh
Line
Count
Source
176
16
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
16
    if (BIT_WIDTH == 0) return 0;
178
179
16
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
16
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
16
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
16
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
16
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
16
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
16
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
16
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
16
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
16
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
16
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
16
    constexpr bool READ_32_BITS =
203
16
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
16
    if (READ_32_BITS) {
206
16
        uint32_t word = in[FIRST_WORD_IDX];
207
16
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
16
        return word & mask;
209
16
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
16
}
_ZN5doris11UnpackValueILi2ELi3ELb1EEEmPKh
Line
Count
Source
176
16
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
16
    if (BIT_WIDTH == 0) return 0;
178
179
16
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
16
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
16
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
16
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
16
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
16
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
16
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
16
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
16
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
16
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
16
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
16
    constexpr bool READ_32_BITS =
203
16
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
16
    if (READ_32_BITS) {
206
16
        uint32_t word = in[FIRST_WORD_IDX];
207
16
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
16
        return word & mask;
209
16
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
16
}
_ZN5doris11UnpackValueILi2ELi4ELb1EEEmPKh
Line
Count
Source
176
16
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
16
    if (BIT_WIDTH == 0) return 0;
178
179
16
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
16
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
16
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
16
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
16
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
16
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
16
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
16
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
16
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
16
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
16
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
16
    constexpr bool READ_32_BITS =
203
16
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
16
    if (READ_32_BITS) {
206
16
        uint32_t word = in[FIRST_WORD_IDX];
207
16
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
16
        return word & mask;
209
16
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
16
}
_ZN5doris11UnpackValueILi2ELi5ELb1EEEmPKh
Line
Count
Source
176
16
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
16
    if (BIT_WIDTH == 0) return 0;
178
179
16
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
16
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
16
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
16
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
16
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
16
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
16
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
16
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
16
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
16
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
16
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
16
    constexpr bool READ_32_BITS =
203
16
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
16
    if (READ_32_BITS) {
206
16
        uint32_t word = in[FIRST_WORD_IDX];
207
16
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
16
        return word & mask;
209
16
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
16
}
_ZN5doris11UnpackValueILi2ELi6ELb1EEEmPKh
Line
Count
Source
176
16
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
16
    if (BIT_WIDTH == 0) return 0;
178
179
16
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
16
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
16
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
16
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
16
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
16
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
16
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
16
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
16
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
16
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
16
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
16
    constexpr bool READ_32_BITS =
203
16
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
16
    if (READ_32_BITS) {
206
16
        uint32_t word = in[FIRST_WORD_IDX];
207
16
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
16
        return word & mask;
209
16
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
16
}
_ZN5doris11UnpackValueILi2ELi7ELb1EEEmPKh
Line
Count
Source
176
16
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
16
    if (BIT_WIDTH == 0) return 0;
178
179
16
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
16
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
16
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
16
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
16
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
16
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
16
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
16
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
16
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
16
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
16
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
16
    constexpr bool READ_32_BITS =
203
16
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
16
    if (READ_32_BITS) {
206
16
        uint32_t word = in[FIRST_WORD_IDX];
207
16
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
16
        return word & mask;
209
16
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
16
}
_ZN5doris11UnpackValueILi2ELi8ELb1EEEmPKh
Line
Count
Source
176
16
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
16
    if (BIT_WIDTH == 0) return 0;
178
179
16
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
16
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
16
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
16
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
16
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
16
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
16
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
16
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
16
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
16
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
16
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
16
    constexpr bool READ_32_BITS =
203
16
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
16
    if (READ_32_BITS) {
206
16
        uint32_t word = in[FIRST_WORD_IDX];
207
16
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
16
        return word & mask;
209
16
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
16
}
_ZN5doris11UnpackValueILi2ELi9ELb1EEEmPKh
Line
Count
Source
176
16
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
16
    if (BIT_WIDTH == 0) return 0;
178
179
16
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
16
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
16
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
16
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
16
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
16
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
16
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
16
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
16
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
16
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
16
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
16
    constexpr bool READ_32_BITS =
203
16
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
16
    if (READ_32_BITS) {
206
16
        uint32_t word = in[FIRST_WORD_IDX];
207
16
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
16
        return word & mask;
209
16
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
16
}
_ZN5doris11UnpackValueILi2ELi10ELb1EEEmPKh
Line
Count
Source
176
16
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
16
    if (BIT_WIDTH == 0) return 0;
178
179
16
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
16
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
16
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
16
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
16
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
16
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
16
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
16
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
16
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
16
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
16
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
16
    constexpr bool READ_32_BITS =
203
16
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
16
    if (READ_32_BITS) {
206
16
        uint32_t word = in[FIRST_WORD_IDX];
207
16
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
16
        return word & mask;
209
16
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
16
}
_ZN5doris11UnpackValueILi2ELi11ELb1EEEmPKh
Line
Count
Source
176
16
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
16
    if (BIT_WIDTH == 0) return 0;
178
179
16
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
16
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
16
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
16
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
16
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
16
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
16
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
16
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
16
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
16
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
16
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
16
    constexpr bool READ_32_BITS =
203
16
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
16
    if (READ_32_BITS) {
206
16
        uint32_t word = in[FIRST_WORD_IDX];
207
16
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
16
        return word & mask;
209
16
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
16
}
_ZN5doris11UnpackValueILi2ELi12ELb1EEEmPKh
Line
Count
Source
176
16
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
16
    if (BIT_WIDTH == 0) return 0;
178
179
16
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
16
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
16
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
16
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
16
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
16
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
16
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
16
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
16
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
16
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
16
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
16
    constexpr bool READ_32_BITS =
203
16
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
16
    if (READ_32_BITS) {
206
16
        uint32_t word = in[FIRST_WORD_IDX];
207
16
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
16
        return word & mask;
209
16
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
16
}
_ZN5doris11UnpackValueILi2ELi13ELb1EEEmPKh
Line
Count
Source
176
16
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
16
    if (BIT_WIDTH == 0) return 0;
178
179
16
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
16
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
16
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
16
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
16
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
16
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
16
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
16
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
16
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
16
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
16
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
16
    constexpr bool READ_32_BITS =
203
16
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
16
    if (READ_32_BITS) {
206
16
        uint32_t word = in[FIRST_WORD_IDX];
207
16
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
16
        return word & mask;
209
16
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
16
}
_ZN5doris11UnpackValueILi2ELi14ELb1EEEmPKh
Line
Count
Source
176
16
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
16
    if (BIT_WIDTH == 0) return 0;
178
179
16
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
16
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
16
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
16
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
16
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
16
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
16
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
16
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
16
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
16
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
16
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
16
    constexpr bool READ_32_BITS =
203
16
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
16
    if (READ_32_BITS) {
206
16
        uint32_t word = in[FIRST_WORD_IDX];
207
16
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
16
        return word & mask;
209
16
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
16
}
_ZN5doris11UnpackValueILi2ELi15ELb1EEEmPKh
Line
Count
Source
176
16
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
16
    if (BIT_WIDTH == 0) return 0;
178
179
16
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
16
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
16
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
16
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
16
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
16
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
16
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
16
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
16
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
16
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
16
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
16
    constexpr bool READ_32_BITS =
203
16
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
16
    if (READ_32_BITS) {
206
16
        uint32_t word = in[FIRST_WORD_IDX];
207
16
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
16
        return word & mask;
209
16
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
16
}
_ZN5doris11UnpackValueILi2ELi16ELb1EEEmPKh
Line
Count
Source
176
16
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
16
    if (BIT_WIDTH == 0) return 0;
178
179
16
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
16
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
16
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
16
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
16
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
16
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
16
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
16
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
16
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
16
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
16
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
16
    constexpr bool READ_32_BITS =
203
16
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
16
    if (READ_32_BITS) {
206
16
        uint32_t word = in[FIRST_WORD_IDX];
207
16
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
16
        return word & mask;
209
16
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
16
}
_ZN5doris11UnpackValueILi2ELi17ELb1EEEmPKh
Line
Count
Source
176
16
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
16
    if (BIT_WIDTH == 0) return 0;
178
179
16
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
16
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
16
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
16
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
16
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
16
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
16
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
16
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
16
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
16
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
16
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
16
    constexpr bool READ_32_BITS =
203
16
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
16
    if (READ_32_BITS) {
206
16
        uint32_t word = in[FIRST_WORD_IDX];
207
16
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
16
        return word & mask;
209
16
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
16
}
_ZN5doris11UnpackValueILi2ELi18ELb1EEEmPKh
Line
Count
Source
176
16
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
16
    if (BIT_WIDTH == 0) return 0;
178
179
16
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
16
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
16
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
16
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
16
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
16
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
16
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
16
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
16
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
16
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
16
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
16
    constexpr bool READ_32_BITS =
203
16
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
16
    if (READ_32_BITS) {
206
16
        uint32_t word = in[FIRST_WORD_IDX];
207
16
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
16
        return word & mask;
209
16
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
16
}
_ZN5doris11UnpackValueILi2ELi19ELb1EEEmPKh
Line
Count
Source
176
16
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
16
    if (BIT_WIDTH == 0) return 0;
178
179
16
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
16
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
16
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
16
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
16
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
16
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
16
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
16
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
16
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
16
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
16
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
16
    constexpr bool READ_32_BITS =
203
16
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
16
    if (READ_32_BITS) {
206
16
        uint32_t word = in[FIRST_WORD_IDX];
207
16
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
16
        return word & mask;
209
16
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
16
}
_ZN5doris11UnpackValueILi2ELi20ELb1EEEmPKh
Line
Count
Source
176
16
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
16
    if (BIT_WIDTH == 0) return 0;
178
179
16
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
16
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
16
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
16
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
16
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
16
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
16
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
16
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
16
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
16
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
16
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
16
    constexpr bool READ_32_BITS =
203
16
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
16
    if (READ_32_BITS) {
206
16
        uint32_t word = in[FIRST_WORD_IDX];
207
16
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
16
        return word & mask;
209
16
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
16
}
_ZN5doris11UnpackValueILi2ELi21ELb1EEEmPKh
Line
Count
Source
176
16
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
16
    if (BIT_WIDTH == 0) return 0;
178
179
16
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
16
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
16
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
16
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
16
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
16
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
16
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
16
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
16
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
16
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
16
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
16
    constexpr bool READ_32_BITS =
203
16
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
16
    if (READ_32_BITS) {
206
16
        uint32_t word = in[FIRST_WORD_IDX];
207
16
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
16
        return word & mask;
209
16
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
16
}
_ZN5doris11UnpackValueILi2ELi22ELb1EEEmPKh
Line
Count
Source
176
16
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
16
    if (BIT_WIDTH == 0) return 0;
178
179
16
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
16
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
16
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
16
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
16
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
16
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
16
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
16
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
16
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
16
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
16
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
16
    constexpr bool READ_32_BITS =
203
16
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
16
    if (READ_32_BITS) {
206
16
        uint32_t word = in[FIRST_WORD_IDX];
207
16
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
16
        return word & mask;
209
16
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
16
}
_ZN5doris11UnpackValueILi2ELi23ELb1EEEmPKh
Line
Count
Source
176
16
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
16
    if (BIT_WIDTH == 0) return 0;
178
179
16
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
16
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
16
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
16
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
16
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
16
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
16
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
16
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
16
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
16
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
16
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
16
    constexpr bool READ_32_BITS =
203
16
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
16
    if (READ_32_BITS) {
206
16
        uint32_t word = in[FIRST_WORD_IDX];
207
16
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
16
        return word & mask;
209
16
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
16
}
_ZN5doris11UnpackValueILi2ELi24ELb1EEEmPKh
Line
Count
Source
176
16
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
16
    if (BIT_WIDTH == 0) return 0;
178
179
16
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
16
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
16
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
16
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
16
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
16
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
16
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
16
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
16
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
16
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
16
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
16
    constexpr bool READ_32_BITS =
203
16
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
16
    if (READ_32_BITS) {
206
16
        uint32_t word = in[FIRST_WORD_IDX];
207
16
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
16
        return word & mask;
209
16
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
16
}
_ZN5doris11UnpackValueILi2ELi25ELb1EEEmPKh
Line
Count
Source
176
16
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
16
    if (BIT_WIDTH == 0) return 0;
178
179
16
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
16
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
16
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
16
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
16
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
16
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
16
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
16
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
16
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
16
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
16
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
16
    constexpr bool READ_32_BITS =
203
16
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
16
    if (READ_32_BITS) {
206
16
        uint32_t word = in[FIRST_WORD_IDX];
207
16
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
16
        return word & mask;
209
16
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
16
}
_ZN5doris11UnpackValueILi2ELi26ELb1EEEmPKh
Line
Count
Source
176
16
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
16
    if (BIT_WIDTH == 0) return 0;
178
179
16
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
16
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
16
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
16
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
16
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
16
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
16
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
16
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
16
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
16
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
16
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
16
    constexpr bool READ_32_BITS =
203
16
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
16
    if (READ_32_BITS) {
206
16
        uint32_t word = in[FIRST_WORD_IDX];
207
16
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
16
        return word & mask;
209
16
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
16
}
_ZN5doris11UnpackValueILi2ELi27ELb1EEEmPKh
Line
Count
Source
176
16
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
16
    if (BIT_WIDTH == 0) return 0;
178
179
16
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
16
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
16
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
16
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
16
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
16
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
16
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
16
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
16
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
16
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
16
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
16
    constexpr bool READ_32_BITS =
203
16
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
16
    if (READ_32_BITS) {
206
16
        uint32_t word = in[FIRST_WORD_IDX];
207
16
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
16
        return word & mask;
209
16
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
16
}
_ZN5doris11UnpackValueILi2ELi28ELb1EEEmPKh
Line
Count
Source
176
16
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
16
    if (BIT_WIDTH == 0) return 0;
178
179
16
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
16
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
16
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
16
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
16
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
16
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
16
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
16
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
16
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
16
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
16
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
16
    constexpr bool READ_32_BITS =
203
16
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
16
    if (READ_32_BITS) {
206
16
        uint32_t word = in[FIRST_WORD_IDX];
207
16
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
16
        return word & mask;
209
16
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
16
}
_ZN5doris11UnpackValueILi2ELi29ELb1EEEmPKh
Line
Count
Source
176
16
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
16
    if (BIT_WIDTH == 0) return 0;
178
179
16
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
16
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
16
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
16
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
16
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
16
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
16
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
16
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
16
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
16
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
16
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
16
    constexpr bool READ_32_BITS =
203
16
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
16
    if (READ_32_BITS) {
206
16
        uint32_t word = in[FIRST_WORD_IDX];
207
16
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
16
        return word & mask;
209
16
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
16
}
_ZN5doris11UnpackValueILi2ELi30ELb1EEEmPKh
Line
Count
Source
176
16
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
16
    if (BIT_WIDTH == 0) return 0;
178
179
16
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
16
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
16
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
16
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
16
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
16
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
16
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
16
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
16
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
16
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
16
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
16
    constexpr bool READ_32_BITS =
203
16
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
16
    if (READ_32_BITS) {
206
16
        uint32_t word = in[FIRST_WORD_IDX];
207
16
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
16
        return word & mask;
209
16
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
16
}
_ZN5doris11UnpackValueILi2ELi31ELb1EEEmPKh
Line
Count
Source
176
16
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
16
    if (BIT_WIDTH == 0) return 0;
178
179
16
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
16
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
16
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
16
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
16
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
16
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
16
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
16
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
16
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
16
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
16
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
16
    constexpr bool READ_32_BITS =
203
16
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
16
    if (READ_32_BITS) {
206
16
        uint32_t word = in[FIRST_WORD_IDX];
207
16
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
16
        return word & mask;
209
16
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
16
}
Unexecuted instantiation: _ZN5doris11UnpackValueILi2ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi2ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi2ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi2ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi2ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi2ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi2ELi24ELb0EEEmPKh
_ZN5doris11UnpackValueILi2ELi23ELb0EEEmPKh
Line
Count
Source
176
19
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
19
    if (BIT_WIDTH == 0) return 0;
178
179
19
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
19
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
19
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
19
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
19
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
19
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
19
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
19
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
19
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
19
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
19
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
19
    constexpr bool READ_32_BITS =
203
19
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
19
    if (READ_32_BITS) {
206
19
        uint32_t word = in[FIRST_WORD_IDX];
207
19
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
19
        return word & mask;
209
19
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
19
}
_ZN5doris11UnpackValueILi2ELi22ELb0EEEmPKh
Line
Count
Source
176
19
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
19
    if (BIT_WIDTH == 0) return 0;
178
179
19
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
19
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
19
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
19
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
19
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
19
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
19
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
19
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
19
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
19
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
19
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
19
    constexpr bool READ_32_BITS =
203
19
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
19
    if (READ_32_BITS) {
206
19
        uint32_t word = in[FIRST_WORD_IDX];
207
19
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
19
        return word & mask;
209
19
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
19
}
_ZN5doris11UnpackValueILi2ELi21ELb0EEEmPKh
Line
Count
Source
176
19
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
19
    if (BIT_WIDTH == 0) return 0;
178
179
19
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
19
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
19
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
19
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
19
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
19
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
19
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
19
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
19
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
19
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
19
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
19
    constexpr bool READ_32_BITS =
203
19
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
19
    if (READ_32_BITS) {
206
19
        uint32_t word = in[FIRST_WORD_IDX];
207
19
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
19
        return word & mask;
209
19
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
19
}
_ZN5doris11UnpackValueILi2ELi20ELb0EEEmPKh
Line
Count
Source
176
19
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
19
    if (BIT_WIDTH == 0) return 0;
178
179
19
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
19
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
19
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
19
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
19
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
19
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
19
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
19
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
19
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
19
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
19
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
19
    constexpr bool READ_32_BITS =
203
19
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
19
    if (READ_32_BITS) {
206
19
        uint32_t word = in[FIRST_WORD_IDX];
207
19
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
19
        return word & mask;
209
19
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
19
}
_ZN5doris11UnpackValueILi2ELi19ELb0EEEmPKh
Line
Count
Source
176
19
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
19
    if (BIT_WIDTH == 0) return 0;
178
179
19
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
19
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
19
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
19
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
19
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
19
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
19
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
19
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
19
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
19
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
19
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
19
    constexpr bool READ_32_BITS =
203
19
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
19
    if (READ_32_BITS) {
206
19
        uint32_t word = in[FIRST_WORD_IDX];
207
19
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
19
        return word & mask;
209
19
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
19
}
_ZN5doris11UnpackValueILi2ELi18ELb0EEEmPKh
Line
Count
Source
176
19
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
19
    if (BIT_WIDTH == 0) return 0;
178
179
19
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
19
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
19
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
19
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
19
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
19
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
19
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
19
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
19
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
19
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
19
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
19
    constexpr bool READ_32_BITS =
203
19
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
19
    if (READ_32_BITS) {
206
19
        uint32_t word = in[FIRST_WORD_IDX];
207
19
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
19
        return word & mask;
209
19
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
19
}
_ZN5doris11UnpackValueILi2ELi17ELb0EEEmPKh
Line
Count
Source
176
19
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
19
    if (BIT_WIDTH == 0) return 0;
178
179
19
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
19
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
19
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
19
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
19
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
19
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
19
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
19
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
19
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
19
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
19
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
19
    constexpr bool READ_32_BITS =
203
19
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
19
    if (READ_32_BITS) {
206
19
        uint32_t word = in[FIRST_WORD_IDX];
207
19
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
19
        return word & mask;
209
19
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
19
}
_ZN5doris11UnpackValueILi2ELi16ELb0EEEmPKh
Line
Count
Source
176
19
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
19
    if (BIT_WIDTH == 0) return 0;
178
179
19
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
19
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
19
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
19
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
19
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
19
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
19
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
19
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
19
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
19
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
19
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
19
    constexpr bool READ_32_BITS =
203
19
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
19
    if (READ_32_BITS) {
206
19
        uint32_t word = in[FIRST_WORD_IDX];
207
19
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
19
        return word & mask;
209
19
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
19
}
_ZN5doris11UnpackValueILi2ELi15ELb0EEEmPKh
Line
Count
Source
176
34
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
34
    if (BIT_WIDTH == 0) return 0;
178
179
34
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
34
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
34
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
34
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
34
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
34
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
34
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
34
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
34
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
34
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
34
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
34
    constexpr bool READ_32_BITS =
203
34
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
34
    if (READ_32_BITS) {
206
34
        uint32_t word = in[FIRST_WORD_IDX];
207
34
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
34
        return word & mask;
209
34
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
34
}
_ZN5doris11UnpackValueILi2ELi14ELb0EEEmPKh
Line
Count
Source
176
34
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
34
    if (BIT_WIDTH == 0) return 0;
178
179
34
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
34
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
34
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
34
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
34
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
34
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
34
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
34
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
34
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
34
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
34
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
34
    constexpr bool READ_32_BITS =
203
34
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
34
    if (READ_32_BITS) {
206
34
        uint32_t word = in[FIRST_WORD_IDX];
207
34
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
34
        return word & mask;
209
34
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
34
}
_ZN5doris11UnpackValueILi2ELi13ELb0EEEmPKh
Line
Count
Source
176
34
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
34
    if (BIT_WIDTH == 0) return 0;
178
179
34
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
34
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
34
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
34
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
34
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
34
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
34
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
34
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
34
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
34
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
34
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
34
    constexpr bool READ_32_BITS =
203
34
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
34
    if (READ_32_BITS) {
206
34
        uint32_t word = in[FIRST_WORD_IDX];
207
34
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
34
        return word & mask;
209
34
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
34
}
_ZN5doris11UnpackValueILi2ELi12ELb0EEEmPKh
Line
Count
Source
176
34
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
34
    if (BIT_WIDTH == 0) return 0;
178
179
34
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
34
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
34
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
34
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
34
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
34
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
34
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
34
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
34
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
34
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
34
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
34
    constexpr bool READ_32_BITS =
203
34
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
34
    if (READ_32_BITS) {
206
34
        uint32_t word = in[FIRST_WORD_IDX];
207
34
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
34
        return word & mask;
209
34
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
34
}
_ZN5doris11UnpackValueILi2ELi11ELb0EEEmPKh
Line
Count
Source
176
34
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
34
    if (BIT_WIDTH == 0) return 0;
178
179
34
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
34
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
34
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
34
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
34
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
34
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
34
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
34
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
34
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
34
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
34
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
34
    constexpr bool READ_32_BITS =
203
34
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
34
    if (READ_32_BITS) {
206
34
        uint32_t word = in[FIRST_WORD_IDX];
207
34
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
34
        return word & mask;
209
34
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
34
}
_ZN5doris11UnpackValueILi2ELi10ELb0EEEmPKh
Line
Count
Source
176
34
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
34
    if (BIT_WIDTH == 0) return 0;
178
179
34
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
34
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
34
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
34
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
34
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
34
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
34
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
34
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
34
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
34
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
34
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
34
    constexpr bool READ_32_BITS =
203
34
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
34
    if (READ_32_BITS) {
206
34
        uint32_t word = in[FIRST_WORD_IDX];
207
34
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
34
        return word & mask;
209
34
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
34
}
_ZN5doris11UnpackValueILi2ELi9ELb0EEEmPKh
Line
Count
Source
176
34
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
34
    if (BIT_WIDTH == 0) return 0;
178
179
34
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
34
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
34
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
34
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
34
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
34
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
34
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
34
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
34
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
34
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
34
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
34
    constexpr bool READ_32_BITS =
203
34
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
34
    if (READ_32_BITS) {
206
34
        uint32_t word = in[FIRST_WORD_IDX];
207
34
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
34
        return word & mask;
209
34
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
34
}
_ZN5doris11UnpackValueILi2ELi8ELb0EEEmPKh
Line
Count
Source
176
34
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
34
    if (BIT_WIDTH == 0) return 0;
178
179
34
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
34
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
34
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
34
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
34
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
34
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
34
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
34
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
34
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
34
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
34
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
34
    constexpr bool READ_32_BITS =
203
34
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
34
    if (READ_32_BITS) {
206
34
        uint32_t word = in[FIRST_WORD_IDX];
207
34
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
34
        return word & mask;
209
34
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
34
}
_ZN5doris11UnpackValueILi2ELi7ELb0EEEmPKh
Line
Count
Source
176
124
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
124
    if (BIT_WIDTH == 0) return 0;
178
179
124
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
124
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
124
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
124
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
124
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
124
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
124
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
124
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
124
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
124
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
124
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
124
    constexpr bool READ_32_BITS =
203
124
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
124
    if (READ_32_BITS) {
206
124
        uint32_t word = in[FIRST_WORD_IDX];
207
124
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
124
        return word & mask;
209
124
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
124
}
_ZN5doris11UnpackValueILi2ELi6ELb0EEEmPKh
Line
Count
Source
176
124
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
124
    if (BIT_WIDTH == 0) return 0;
178
179
124
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
124
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
124
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
124
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
124
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
124
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
124
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
124
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
124
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
124
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
124
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
124
    constexpr bool READ_32_BITS =
203
124
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
124
    if (READ_32_BITS) {
206
124
        uint32_t word = in[FIRST_WORD_IDX];
207
124
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
124
        return word & mask;
209
124
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
124
}
_ZN5doris11UnpackValueILi2ELi5ELb0EEEmPKh
Line
Count
Source
176
124
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
124
    if (BIT_WIDTH == 0) return 0;
178
179
124
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
124
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
124
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
124
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
124
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
124
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
124
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
124
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
124
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
124
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
124
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
124
    constexpr bool READ_32_BITS =
203
124
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
124
    if (READ_32_BITS) {
206
124
        uint32_t word = in[FIRST_WORD_IDX];
207
124
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
124
        return word & mask;
209
124
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
124
}
_ZN5doris11UnpackValueILi2ELi4ELb0EEEmPKh
Line
Count
Source
176
124
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
124
    if (BIT_WIDTH == 0) return 0;
178
179
124
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
124
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
124
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
124
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
124
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
124
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
124
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
124
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
124
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
124
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
124
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
124
    constexpr bool READ_32_BITS =
203
124
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
124
    if (READ_32_BITS) {
206
124
        uint32_t word = in[FIRST_WORD_IDX];
207
124
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
124
        return word & mask;
209
124
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
124
}
_ZN5doris11UnpackValueILi2ELi3ELb0EEEmPKh
Line
Count
Source
176
124
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
124
    if (BIT_WIDTH == 0) return 0;
178
179
124
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
124
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
124
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
124
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
124
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
124
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
124
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
124
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
124
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
124
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
124
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
124
    constexpr bool READ_32_BITS =
203
124
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
124
    if (READ_32_BITS) {
206
124
        uint32_t word = in[FIRST_WORD_IDX];
207
124
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
124
        return word & mask;
209
124
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
124
}
_ZN5doris11UnpackValueILi2ELi2ELb0EEEmPKh
Line
Count
Source
176
124
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
124
    if (BIT_WIDTH == 0) return 0;
178
179
124
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
124
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
124
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
124
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
124
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
124
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
124
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
124
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
124
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
124
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
124
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
124
    constexpr bool READ_32_BITS =
203
124
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
124
    if (READ_32_BITS) {
206
124
        uint32_t word = in[FIRST_WORD_IDX];
207
124
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
124
        return word & mask;
209
124
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
124
}
_ZN5doris11UnpackValueILi2ELi1ELb0EEEmPKh
Line
Count
Source
176
124
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
124
    if (BIT_WIDTH == 0) return 0;
178
179
124
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
124
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
124
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
124
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
124
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
124
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
124
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
124
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
124
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
124
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
124
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
124
    constexpr bool READ_32_BITS =
203
124
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
124
    if (READ_32_BITS) {
206
124
        uint32_t word = in[FIRST_WORD_IDX];
207
124
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
124
        return word & mask;
209
124
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
124
}
_ZN5doris11UnpackValueILi2ELi0ELb0EEEmPKh
Line
Count
Source
176
124
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
124
    if (BIT_WIDTH == 0) return 0;
178
179
124
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
124
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
124
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
124
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
124
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
124
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
124
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
124
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
124
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
124
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
124
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
124
    constexpr bool READ_32_BITS =
203
124
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
124
    if (READ_32_BITS) {
206
124
        uint32_t word = in[FIRST_WORD_IDX];
207
124
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
124
        return word & mask;
209
124
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
124
}
_ZN5doris11UnpackValueILi3ELi0ELb1EEEmPKh
Line
Count
Source
176
4.00k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.00k
    if (BIT_WIDTH == 0) return 0;
178
179
4.00k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.00k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.00k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.00k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.00k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.00k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.00k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.00k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.00k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.00k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.00k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.00k
    constexpr bool READ_32_BITS =
203
4.00k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.00k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.00k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.00k
    word >>= FIRST_BIT_OFFSET;
213
214
4.00k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.00k
    return word & mask;
221
4.00k
}
_ZN5doris11UnpackValueILi3ELi1ELb1EEEmPKh
Line
Count
Source
176
4.00k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.00k
    if (BIT_WIDTH == 0) return 0;
178
179
4.00k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.00k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.00k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.00k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.00k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.00k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.00k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.00k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.00k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.00k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.00k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.00k
    constexpr bool READ_32_BITS =
203
4.00k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.00k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.00k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.00k
    word >>= FIRST_BIT_OFFSET;
213
214
4.00k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.00k
    return word & mask;
221
4.00k
}
_ZN5doris11UnpackValueILi3ELi2ELb1EEEmPKh
Line
Count
Source
176
4.00k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.00k
    if (BIT_WIDTH == 0) return 0;
178
179
4.00k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.00k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.00k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.00k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.00k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.00k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.00k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.00k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.00k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.00k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.00k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.00k
    constexpr bool READ_32_BITS =
203
4.00k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.00k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.00k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.00k
    word >>= FIRST_BIT_OFFSET;
213
214
4.00k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.00k
    return word & mask;
221
4.00k
}
_ZN5doris11UnpackValueILi3ELi3ELb1EEEmPKh
Line
Count
Source
176
4.00k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.00k
    if (BIT_WIDTH == 0) return 0;
178
179
4.00k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.00k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.00k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.00k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.00k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.00k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.00k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.00k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.00k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.00k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.00k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.00k
    constexpr bool READ_32_BITS =
203
4.00k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.00k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.00k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.00k
    word >>= FIRST_BIT_OFFSET;
213
214
4.00k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.00k
    return word & mask;
221
4.00k
}
_ZN5doris11UnpackValueILi3ELi4ELb1EEEmPKh
Line
Count
Source
176
4.00k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.00k
    if (BIT_WIDTH == 0) return 0;
178
179
4.00k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.00k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.00k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.00k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.00k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.00k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.00k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.00k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.00k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.00k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.00k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.00k
    constexpr bool READ_32_BITS =
203
4.00k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.00k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.00k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.00k
    word >>= FIRST_BIT_OFFSET;
213
214
4.00k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.00k
    return word & mask;
221
4.00k
}
_ZN5doris11UnpackValueILi3ELi5ELb1EEEmPKh
Line
Count
Source
176
4.00k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.00k
    if (BIT_WIDTH == 0) return 0;
178
179
4.00k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.00k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.00k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.00k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.00k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.00k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.00k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.00k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.00k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.00k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.00k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.00k
    constexpr bool READ_32_BITS =
203
4.00k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.00k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.00k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.00k
    word >>= FIRST_BIT_OFFSET;
213
214
4.00k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.00k
    return word & mask;
221
4.00k
}
_ZN5doris11UnpackValueILi3ELi6ELb1EEEmPKh
Line
Count
Source
176
4.00k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.00k
    if (BIT_WIDTH == 0) return 0;
178
179
4.00k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.00k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.00k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.00k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.00k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.00k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.00k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.00k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.00k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.00k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.00k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.00k
    constexpr bool READ_32_BITS =
203
4.00k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.00k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.00k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.00k
    word >>= FIRST_BIT_OFFSET;
213
214
4.00k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.00k
    return word & mask;
221
4.00k
}
_ZN5doris11UnpackValueILi3ELi7ELb1EEEmPKh
Line
Count
Source
176
4.00k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.00k
    if (BIT_WIDTH == 0) return 0;
178
179
4.00k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.00k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.00k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.00k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.00k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.00k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.00k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.00k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.00k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.00k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.00k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.00k
    constexpr bool READ_32_BITS =
203
4.00k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.00k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.00k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.00k
    word >>= FIRST_BIT_OFFSET;
213
214
4.00k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.00k
    return word & mask;
221
4.00k
}
_ZN5doris11UnpackValueILi3ELi8ELb1EEEmPKh
Line
Count
Source
176
4.00k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.00k
    if (BIT_WIDTH == 0) return 0;
178
179
4.00k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.00k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.00k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.00k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.00k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.00k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.00k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.00k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.00k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.00k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.00k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.00k
    constexpr bool READ_32_BITS =
203
4.00k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.00k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.00k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.00k
    word >>= FIRST_BIT_OFFSET;
213
214
4.00k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.00k
    return word & mask;
221
4.00k
}
_ZN5doris11UnpackValueILi3ELi9ELb1EEEmPKh
Line
Count
Source
176
4.00k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.00k
    if (BIT_WIDTH == 0) return 0;
178
179
4.00k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.00k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.00k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.00k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.00k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.00k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.00k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.00k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.00k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.00k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.00k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.00k
    constexpr bool READ_32_BITS =
203
4.00k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.00k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.00k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.00k
    word >>= FIRST_BIT_OFFSET;
213
214
4.00k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.00k
    return word & mask;
221
4.00k
}
_ZN5doris11UnpackValueILi3ELi10ELb1EEEmPKh
Line
Count
Source
176
4.00k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.00k
    if (BIT_WIDTH == 0) return 0;
178
179
4.00k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.00k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.00k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.00k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.00k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.00k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.00k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.00k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.00k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.00k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.00k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.00k
    constexpr bool READ_32_BITS =
203
4.00k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.00k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.00k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.00k
    word >>= FIRST_BIT_OFFSET;
213
214
4.00k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.00k
    return word & mask;
221
4.00k
}
_ZN5doris11UnpackValueILi3ELi11ELb1EEEmPKh
Line
Count
Source
176
4.00k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.00k
    if (BIT_WIDTH == 0) return 0;
178
179
4.00k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.00k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.00k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.00k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.00k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.00k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.00k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.00k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.00k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.00k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.00k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.00k
    constexpr bool READ_32_BITS =
203
4.00k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.00k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.00k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.00k
    word >>= FIRST_BIT_OFFSET;
213
214
4.00k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.00k
    return word & mask;
221
4.00k
}
_ZN5doris11UnpackValueILi3ELi12ELb1EEEmPKh
Line
Count
Source
176
4.00k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.00k
    if (BIT_WIDTH == 0) return 0;
178
179
4.00k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.00k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.00k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.00k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.00k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.00k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.00k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.00k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.00k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.00k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.00k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.00k
    constexpr bool READ_32_BITS =
203
4.00k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.00k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.00k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.00k
    word >>= FIRST_BIT_OFFSET;
213
214
4.00k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.00k
    return word & mask;
221
4.00k
}
_ZN5doris11UnpackValueILi3ELi13ELb1EEEmPKh
Line
Count
Source
176
4.00k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.00k
    if (BIT_WIDTH == 0) return 0;
178
179
4.00k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.00k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.00k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.00k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.00k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.00k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.00k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.00k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.00k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.00k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.00k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.00k
    constexpr bool READ_32_BITS =
203
4.00k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.00k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.00k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.00k
    word >>= FIRST_BIT_OFFSET;
213
214
4.00k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.00k
    return word & mask;
221
4.00k
}
_ZN5doris11UnpackValueILi3ELi14ELb1EEEmPKh
Line
Count
Source
176
4.00k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.00k
    if (BIT_WIDTH == 0) return 0;
178
179
4.00k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.00k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.00k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.00k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.00k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.00k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.00k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.00k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.00k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.00k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.00k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.00k
    constexpr bool READ_32_BITS =
203
4.00k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.00k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.00k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.00k
    word >>= FIRST_BIT_OFFSET;
213
214
4.00k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.00k
    return word & mask;
221
4.00k
}
_ZN5doris11UnpackValueILi3ELi15ELb1EEEmPKh
Line
Count
Source
176
4.00k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.00k
    if (BIT_WIDTH == 0) return 0;
178
179
4.00k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.00k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.00k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.00k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.00k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.00k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.00k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.00k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.00k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.00k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.00k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.00k
    constexpr bool READ_32_BITS =
203
4.00k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.00k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.00k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.00k
    word >>= FIRST_BIT_OFFSET;
213
214
4.00k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.00k
    return word & mask;
221
4.00k
}
_ZN5doris11UnpackValueILi3ELi16ELb1EEEmPKh
Line
Count
Source
176
4.00k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.00k
    if (BIT_WIDTH == 0) return 0;
178
179
4.00k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.00k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.00k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.00k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.00k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.00k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.00k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.00k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.00k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.00k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.00k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.00k
    constexpr bool READ_32_BITS =
203
4.00k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.00k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.00k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.00k
    word >>= FIRST_BIT_OFFSET;
213
214
4.00k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.00k
    return word & mask;
221
4.00k
}
_ZN5doris11UnpackValueILi3ELi17ELb1EEEmPKh
Line
Count
Source
176
4.00k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.00k
    if (BIT_WIDTH == 0) return 0;
178
179
4.00k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.00k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.00k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.00k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.00k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.00k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.00k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.00k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.00k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.00k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.00k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.00k
    constexpr bool READ_32_BITS =
203
4.00k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.00k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.00k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.00k
    word >>= FIRST_BIT_OFFSET;
213
214
4.00k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.00k
    return word & mask;
221
4.00k
}
_ZN5doris11UnpackValueILi3ELi18ELb1EEEmPKh
Line
Count
Source
176
4.00k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.00k
    if (BIT_WIDTH == 0) return 0;
178
179
4.00k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.00k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.00k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.00k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.00k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.00k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.00k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.00k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.00k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.00k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.00k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.00k
    constexpr bool READ_32_BITS =
203
4.00k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.00k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.00k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.00k
    word >>= FIRST_BIT_OFFSET;
213
214
4.00k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.00k
    return word & mask;
221
4.00k
}
_ZN5doris11UnpackValueILi3ELi19ELb1EEEmPKh
Line
Count
Source
176
4.00k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.00k
    if (BIT_WIDTH == 0) return 0;
178
179
4.00k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.00k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.00k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.00k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.00k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.00k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.00k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.00k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.00k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.00k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.00k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.00k
    constexpr bool READ_32_BITS =
203
4.00k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.00k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.00k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.00k
    word >>= FIRST_BIT_OFFSET;
213
214
4.00k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.00k
    return word & mask;
221
4.00k
}
_ZN5doris11UnpackValueILi3ELi20ELb1EEEmPKh
Line
Count
Source
176
4.00k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.00k
    if (BIT_WIDTH == 0) return 0;
178
179
4.00k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.00k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.00k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.00k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.00k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.00k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.00k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.00k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.00k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.00k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.00k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.00k
    constexpr bool READ_32_BITS =
203
4.00k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.00k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.00k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.00k
    word >>= FIRST_BIT_OFFSET;
213
214
4.00k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.00k
    return word & mask;
221
4.00k
}
_ZN5doris11UnpackValueILi3ELi21ELb1EEEmPKh
Line
Count
Source
176
4.00k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.00k
    if (BIT_WIDTH == 0) return 0;
178
179
4.00k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.00k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.00k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.00k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.00k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.00k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.00k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.00k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.00k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.00k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.00k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.00k
    constexpr bool READ_32_BITS =
203
4.00k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.00k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.00k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.00k
    word >>= FIRST_BIT_OFFSET;
213
214
4.00k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.00k
    return word & mask;
221
4.00k
}
_ZN5doris11UnpackValueILi3ELi22ELb1EEEmPKh
Line
Count
Source
176
4.00k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.00k
    if (BIT_WIDTH == 0) return 0;
178
179
4.00k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.00k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.00k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.00k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.00k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.00k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.00k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.00k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.00k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.00k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.00k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.00k
    constexpr bool READ_32_BITS =
203
4.00k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.00k
    if (READ_32_BITS) {
206
4.00k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.00k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.00k
        return word & mask;
209
4.00k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.00k
}
_ZN5doris11UnpackValueILi3ELi23ELb1EEEmPKh
Line
Count
Source
176
4.00k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.00k
    if (BIT_WIDTH == 0) return 0;
178
179
4.00k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.00k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.00k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.00k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.00k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.00k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.00k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.00k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.00k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.00k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.00k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.00k
    constexpr bool READ_32_BITS =
203
4.00k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.00k
    if (READ_32_BITS) {
206
4.00k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.00k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.00k
        return word & mask;
209
4.00k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.00k
}
_ZN5doris11UnpackValueILi3ELi24ELb1EEEmPKh
Line
Count
Source
176
4.00k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.00k
    if (BIT_WIDTH == 0) return 0;
178
179
4.00k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.00k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.00k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.00k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.00k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.00k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.00k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.00k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.00k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.00k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.00k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.00k
    constexpr bool READ_32_BITS =
203
4.00k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.00k
    if (READ_32_BITS) {
206
4.00k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.00k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.00k
        return word & mask;
209
4.00k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.00k
}
_ZN5doris11UnpackValueILi3ELi25ELb1EEEmPKh
Line
Count
Source
176
4.00k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.00k
    if (BIT_WIDTH == 0) return 0;
178
179
4.00k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.00k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.00k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.00k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.00k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.00k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.00k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.00k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.00k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.00k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.00k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.00k
    constexpr bool READ_32_BITS =
203
4.00k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.00k
    if (READ_32_BITS) {
206
4.00k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.00k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.00k
        return word & mask;
209
4.00k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.00k
}
_ZN5doris11UnpackValueILi3ELi26ELb1EEEmPKh
Line
Count
Source
176
4.00k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.00k
    if (BIT_WIDTH == 0) return 0;
178
179
4.00k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.00k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.00k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.00k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.00k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.00k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.00k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.00k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.00k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.00k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.00k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.00k
    constexpr bool READ_32_BITS =
203
4.00k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.00k
    if (READ_32_BITS) {
206
4.00k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.00k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.00k
        return word & mask;
209
4.00k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.00k
}
_ZN5doris11UnpackValueILi3ELi27ELb1EEEmPKh
Line
Count
Source
176
4.00k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.00k
    if (BIT_WIDTH == 0) return 0;
178
179
4.00k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.00k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.00k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.00k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.00k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.00k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.00k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.00k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.00k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.00k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.00k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.00k
    constexpr bool READ_32_BITS =
203
4.00k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.00k
    if (READ_32_BITS) {
206
4.00k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.00k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.00k
        return word & mask;
209
4.00k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.00k
}
_ZN5doris11UnpackValueILi3ELi28ELb1EEEmPKh
Line
Count
Source
176
4.00k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.00k
    if (BIT_WIDTH == 0) return 0;
178
179
4.00k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.00k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.00k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.00k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.00k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.00k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.00k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.00k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.00k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.00k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.00k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.00k
    constexpr bool READ_32_BITS =
203
4.00k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.00k
    if (READ_32_BITS) {
206
4.00k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.00k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.00k
        return word & mask;
209
4.00k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.00k
}
_ZN5doris11UnpackValueILi3ELi29ELb1EEEmPKh
Line
Count
Source
176
4.00k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.00k
    if (BIT_WIDTH == 0) return 0;
178
179
4.00k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.00k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.00k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.00k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.00k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.00k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.00k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.00k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.00k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.00k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.00k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.00k
    constexpr bool READ_32_BITS =
203
4.00k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.00k
    if (READ_32_BITS) {
206
4.00k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.00k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.00k
        return word & mask;
209
4.00k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.00k
}
_ZN5doris11UnpackValueILi3ELi30ELb1EEEmPKh
Line
Count
Source
176
4.00k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.00k
    if (BIT_WIDTH == 0) return 0;
178
179
4.00k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.00k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.00k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.00k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.00k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.00k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.00k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.00k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.00k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.00k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.00k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.00k
    constexpr bool READ_32_BITS =
203
4.00k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.00k
    if (READ_32_BITS) {
206
4.00k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.00k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.00k
        return word & mask;
209
4.00k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.00k
}
_ZN5doris11UnpackValueILi3ELi31ELb1EEEmPKh
Line
Count
Source
176
4.00k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.00k
    if (BIT_WIDTH == 0) return 0;
178
179
4.00k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.00k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.00k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.00k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.00k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.00k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.00k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.00k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.00k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.00k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.00k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.00k
    constexpr bool READ_32_BITS =
203
4.00k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.00k
    if (READ_32_BITS) {
206
4.00k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.00k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.00k
        return word & mask;
209
4.00k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.00k
}
Unexecuted instantiation: _ZN5doris11UnpackValueILi3ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi3ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi3ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi3ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi3ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi3ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi3ELi24ELb0EEEmPKh
_ZN5doris11UnpackValueILi3ELi23ELb0EEEmPKh
Line
Count
Source
176
278
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
278
    if (BIT_WIDTH == 0) return 0;
178
179
278
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
278
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
278
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
278
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
278
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
278
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
278
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
278
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
278
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
278
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
278
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
278
    constexpr bool READ_32_BITS =
203
278
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
278
    if (READ_32_BITS) {
206
278
        uint32_t word = in[FIRST_WORD_IDX];
207
278
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
278
        return word & mask;
209
278
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
278
}
_ZN5doris11UnpackValueILi3ELi22ELb0EEEmPKh
Line
Count
Source
176
278
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
278
    if (BIT_WIDTH == 0) return 0;
178
179
278
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
278
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
278
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
278
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
278
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
278
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
278
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
278
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
278
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
278
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
278
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
278
    constexpr bool READ_32_BITS =
203
278
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
278
    if (READ_32_BITS) {
206
278
        uint32_t word = in[FIRST_WORD_IDX];
207
278
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
278
        return word & mask;
209
278
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
278
}
_ZN5doris11UnpackValueILi3ELi21ELb0EEEmPKh
Line
Count
Source
176
278
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
278
    if (BIT_WIDTH == 0) return 0;
178
179
278
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
278
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
278
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
278
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
278
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
278
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
278
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
278
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
278
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
278
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
278
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
278
    constexpr bool READ_32_BITS =
203
278
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
278
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
278
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
278
    word >>= FIRST_BIT_OFFSET;
213
214
278
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
278
    return word & mask;
221
278
}
_ZN5doris11UnpackValueILi3ELi20ELb0EEEmPKh
Line
Count
Source
176
278
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
278
    if (BIT_WIDTH == 0) return 0;
178
179
278
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
278
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
278
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
278
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
278
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
278
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
278
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
278
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
278
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
278
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
278
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
278
    constexpr bool READ_32_BITS =
203
278
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
278
    if (READ_32_BITS) {
206
278
        uint32_t word = in[FIRST_WORD_IDX];
207
278
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
278
        return word & mask;
209
278
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
278
}
_ZN5doris11UnpackValueILi3ELi19ELb0EEEmPKh
Line
Count
Source
176
278
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
278
    if (BIT_WIDTH == 0) return 0;
178
179
278
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
278
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
278
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
278
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
278
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
278
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
278
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
278
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
278
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
278
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
278
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
278
    constexpr bool READ_32_BITS =
203
278
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
278
    if (READ_32_BITS) {
206
278
        uint32_t word = in[FIRST_WORD_IDX];
207
278
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
278
        return word & mask;
209
278
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
278
}
_ZN5doris11UnpackValueILi3ELi18ELb0EEEmPKh
Line
Count
Source
176
278
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
278
    if (BIT_WIDTH == 0) return 0;
178
179
278
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
278
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
278
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
278
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
278
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
278
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
278
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
278
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
278
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
278
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
278
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
278
    constexpr bool READ_32_BITS =
203
278
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
278
    if (READ_32_BITS) {
206
278
        uint32_t word = in[FIRST_WORD_IDX];
207
278
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
278
        return word & mask;
209
278
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
278
}
_ZN5doris11UnpackValueILi3ELi17ELb0EEEmPKh
Line
Count
Source
176
278
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
278
    if (BIT_WIDTH == 0) return 0;
178
179
278
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
278
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
278
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
278
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
278
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
278
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
278
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
278
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
278
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
278
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
278
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
278
    constexpr bool READ_32_BITS =
203
278
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
278
    if (READ_32_BITS) {
206
278
        uint32_t word = in[FIRST_WORD_IDX];
207
278
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
278
        return word & mask;
209
278
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
278
}
_ZN5doris11UnpackValueILi3ELi16ELb0EEEmPKh
Line
Count
Source
176
278
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
278
    if (BIT_WIDTH == 0) return 0;
178
179
278
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
278
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
278
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
278
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
278
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
278
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
278
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
278
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
278
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
278
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
278
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
278
    constexpr bool READ_32_BITS =
203
278
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
278
    if (READ_32_BITS) {
206
278
        uint32_t word = in[FIRST_WORD_IDX];
207
278
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
278
        return word & mask;
209
278
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
278
}
_ZN5doris11UnpackValueILi3ELi15ELb0EEEmPKh
Line
Count
Source
176
323
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
323
    if (BIT_WIDTH == 0) return 0;
178
179
323
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
323
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
323
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
323
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
323
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
323
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
323
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
323
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
323
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
323
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
323
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
323
    constexpr bool READ_32_BITS =
203
323
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
323
    if (READ_32_BITS) {
206
323
        uint32_t word = in[FIRST_WORD_IDX];
207
323
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
323
        return word & mask;
209
323
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
323
}
_ZN5doris11UnpackValueILi3ELi14ELb0EEEmPKh
Line
Count
Source
176
323
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
323
    if (BIT_WIDTH == 0) return 0;
178
179
323
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
323
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
323
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
323
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
323
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
323
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
323
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
323
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
323
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
323
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
323
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
323
    constexpr bool READ_32_BITS =
203
323
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
323
    if (READ_32_BITS) {
206
323
        uint32_t word = in[FIRST_WORD_IDX];
207
323
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
323
        return word & mask;
209
323
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
323
}
_ZN5doris11UnpackValueILi3ELi13ELb0EEEmPKh
Line
Count
Source
176
323
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
323
    if (BIT_WIDTH == 0) return 0;
178
179
323
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
323
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
323
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
323
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
323
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
323
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
323
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
323
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
323
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
323
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
323
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
323
    constexpr bool READ_32_BITS =
203
323
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
323
    if (READ_32_BITS) {
206
323
        uint32_t word = in[FIRST_WORD_IDX];
207
323
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
323
        return word & mask;
209
323
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
323
}
_ZN5doris11UnpackValueILi3ELi12ELb0EEEmPKh
Line
Count
Source
176
323
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
323
    if (BIT_WIDTH == 0) return 0;
178
179
323
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
323
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
323
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
323
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
323
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
323
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
323
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
323
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
323
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
323
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
323
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
323
    constexpr bool READ_32_BITS =
203
323
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
323
    if (READ_32_BITS) {
206
323
        uint32_t word = in[FIRST_WORD_IDX];
207
323
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
323
        return word & mask;
209
323
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
323
}
_ZN5doris11UnpackValueILi3ELi11ELb0EEEmPKh
Line
Count
Source
176
323
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
323
    if (BIT_WIDTH == 0) return 0;
178
179
323
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
323
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
323
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
323
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
323
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
323
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
323
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
323
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
323
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
323
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
323
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
323
    constexpr bool READ_32_BITS =
203
323
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
323
    if (READ_32_BITS) {
206
323
        uint32_t word = in[FIRST_WORD_IDX];
207
323
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
323
        return word & mask;
209
323
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
323
}
_ZN5doris11UnpackValueILi3ELi10ELb0EEEmPKh
Line
Count
Source
176
323
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
323
    if (BIT_WIDTH == 0) return 0;
178
179
323
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
323
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
323
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
323
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
323
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
323
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
323
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
323
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
323
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
323
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
323
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
323
    constexpr bool READ_32_BITS =
203
323
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
323
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
323
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
323
    word >>= FIRST_BIT_OFFSET;
213
214
323
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
323
    return word & mask;
221
323
}
_ZN5doris11UnpackValueILi3ELi9ELb0EEEmPKh
Line
Count
Source
176
323
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
323
    if (BIT_WIDTH == 0) return 0;
178
179
323
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
323
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
323
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
323
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
323
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
323
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
323
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
323
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
323
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
323
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
323
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
323
    constexpr bool READ_32_BITS =
203
323
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
323
    if (READ_32_BITS) {
206
323
        uint32_t word = in[FIRST_WORD_IDX];
207
323
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
323
        return word & mask;
209
323
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
323
}
_ZN5doris11UnpackValueILi3ELi8ELb0EEEmPKh
Line
Count
Source
176
323
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
323
    if (BIT_WIDTH == 0) return 0;
178
179
323
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
323
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
323
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
323
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
323
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
323
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
323
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
323
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
323
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
323
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
323
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
323
    constexpr bool READ_32_BITS =
203
323
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
323
    if (READ_32_BITS) {
206
323
        uint32_t word = in[FIRST_WORD_IDX];
207
323
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
323
        return word & mask;
209
323
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
323
}
_ZN5doris11UnpackValueILi3ELi7ELb0EEEmPKh
Line
Count
Source
176
465
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
465
    if (BIT_WIDTH == 0) return 0;
178
179
465
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
465
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
465
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
465
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
465
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
465
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
465
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
465
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
465
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
465
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
465
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
465
    constexpr bool READ_32_BITS =
203
465
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
465
    if (READ_32_BITS) {
206
465
        uint32_t word = in[FIRST_WORD_IDX];
207
465
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
465
        return word & mask;
209
465
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
465
}
_ZN5doris11UnpackValueILi3ELi6ELb0EEEmPKh
Line
Count
Source
176
465
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
465
    if (BIT_WIDTH == 0) return 0;
178
179
465
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
465
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
465
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
465
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
465
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
465
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
465
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
465
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
465
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
465
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
465
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
465
    constexpr bool READ_32_BITS =
203
465
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
465
    if (READ_32_BITS) {
206
465
        uint32_t word = in[FIRST_WORD_IDX];
207
465
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
465
        return word & mask;
209
465
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
465
}
_ZN5doris11UnpackValueILi3ELi5ELb0EEEmPKh
Line
Count
Source
176
465
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
465
    if (BIT_WIDTH == 0) return 0;
178
179
465
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
465
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
465
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
465
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
465
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
465
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
465
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
465
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
465
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
465
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
465
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
465
    constexpr bool READ_32_BITS =
203
465
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
465
    if (READ_32_BITS) {
206
465
        uint32_t word = in[FIRST_WORD_IDX];
207
465
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
465
        return word & mask;
209
465
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
465
}
_ZN5doris11UnpackValueILi3ELi4ELb0EEEmPKh
Line
Count
Source
176
465
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
465
    if (BIT_WIDTH == 0) return 0;
178
179
465
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
465
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
465
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
465
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
465
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
465
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
465
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
465
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
465
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
465
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
465
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
465
    constexpr bool READ_32_BITS =
203
465
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
465
    if (READ_32_BITS) {
206
465
        uint32_t word = in[FIRST_WORD_IDX];
207
465
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
465
        return word & mask;
209
465
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
465
}
_ZN5doris11UnpackValueILi3ELi3ELb0EEEmPKh
Line
Count
Source
176
465
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
465
    if (BIT_WIDTH == 0) return 0;
178
179
465
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
465
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
465
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
465
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
465
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
465
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
465
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
465
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
465
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
465
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
465
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
465
    constexpr bool READ_32_BITS =
203
465
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
465
    if (READ_32_BITS) {
206
465
        uint32_t word = in[FIRST_WORD_IDX];
207
465
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
465
        return word & mask;
209
465
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
465
}
_ZN5doris11UnpackValueILi3ELi2ELb0EEEmPKh
Line
Count
Source
176
465
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
465
    if (BIT_WIDTH == 0) return 0;
178
179
465
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
465
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
465
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
465
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
465
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
465
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
465
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
465
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
465
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
465
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
465
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
465
    constexpr bool READ_32_BITS =
203
465
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
465
    if (READ_32_BITS) {
206
465
        uint32_t word = in[FIRST_WORD_IDX];
207
465
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
465
        return word & mask;
209
465
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
465
}
_ZN5doris11UnpackValueILi3ELi1ELb0EEEmPKh
Line
Count
Source
176
465
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
465
    if (BIT_WIDTH == 0) return 0;
178
179
465
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
465
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
465
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
465
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
465
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
465
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
465
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
465
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
465
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
465
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
465
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
465
    constexpr bool READ_32_BITS =
203
465
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
465
    if (READ_32_BITS) {
206
465
        uint32_t word = in[FIRST_WORD_IDX];
207
465
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
465
        return word & mask;
209
465
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
465
}
_ZN5doris11UnpackValueILi3ELi0ELb0EEEmPKh
Line
Count
Source
176
465
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
465
    if (BIT_WIDTH == 0) return 0;
178
179
465
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
465
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
465
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
465
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
465
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
465
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
465
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
465
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
465
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
465
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
465
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
465
    constexpr bool READ_32_BITS =
203
465
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
465
    if (READ_32_BITS) {
206
465
        uint32_t word = in[FIRST_WORD_IDX];
207
465
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
465
        return word & mask;
209
465
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
465
}
_ZN5doris11UnpackValueILi4ELi0ELb1EEEmPKh
Line
Count
Source
176
378
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
378
    if (BIT_WIDTH == 0) return 0;
178
179
378
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
378
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
378
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
378
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
378
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
378
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
378
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
378
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
378
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
378
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
378
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
378
    constexpr bool READ_32_BITS =
203
378
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
378
    if (READ_32_BITS) {
206
378
        uint32_t word = in[FIRST_WORD_IDX];
207
378
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
378
        return word & mask;
209
378
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
378
}
_ZN5doris11UnpackValueILi4ELi1ELb1EEEmPKh
Line
Count
Source
176
378
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
378
    if (BIT_WIDTH == 0) return 0;
178
179
378
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
378
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
378
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
378
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
378
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
378
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
378
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
378
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
378
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
378
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
378
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
378
    constexpr bool READ_32_BITS =
203
378
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
378
    if (READ_32_BITS) {
206
378
        uint32_t word = in[FIRST_WORD_IDX];
207
378
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
378
        return word & mask;
209
378
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
378
}
_ZN5doris11UnpackValueILi4ELi2ELb1EEEmPKh
Line
Count
Source
176
378
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
378
    if (BIT_WIDTH == 0) return 0;
178
179
378
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
378
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
378
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
378
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
378
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
378
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
378
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
378
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
378
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
378
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
378
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
378
    constexpr bool READ_32_BITS =
203
378
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
378
    if (READ_32_BITS) {
206
378
        uint32_t word = in[FIRST_WORD_IDX];
207
378
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
378
        return word & mask;
209
378
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
378
}
_ZN5doris11UnpackValueILi4ELi3ELb1EEEmPKh
Line
Count
Source
176
378
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
378
    if (BIT_WIDTH == 0) return 0;
178
179
378
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
378
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
378
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
378
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
378
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
378
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
378
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
378
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
378
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
378
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
378
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
378
    constexpr bool READ_32_BITS =
203
378
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
378
    if (READ_32_BITS) {
206
378
        uint32_t word = in[FIRST_WORD_IDX];
207
378
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
378
        return word & mask;
209
378
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
378
}
_ZN5doris11UnpackValueILi4ELi4ELb1EEEmPKh
Line
Count
Source
176
378
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
378
    if (BIT_WIDTH == 0) return 0;
178
179
378
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
378
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
378
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
378
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
378
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
378
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
378
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
378
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
378
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
378
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
378
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
378
    constexpr bool READ_32_BITS =
203
378
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
378
    if (READ_32_BITS) {
206
378
        uint32_t word = in[FIRST_WORD_IDX];
207
378
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
378
        return word & mask;
209
378
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
378
}
_ZN5doris11UnpackValueILi4ELi5ELb1EEEmPKh
Line
Count
Source
176
378
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
378
    if (BIT_WIDTH == 0) return 0;
178
179
378
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
378
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
378
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
378
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
378
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
378
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
378
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
378
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
378
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
378
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
378
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
378
    constexpr bool READ_32_BITS =
203
378
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
378
    if (READ_32_BITS) {
206
378
        uint32_t word = in[FIRST_WORD_IDX];
207
378
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
378
        return word & mask;
209
378
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
378
}
_ZN5doris11UnpackValueILi4ELi6ELb1EEEmPKh
Line
Count
Source
176
378
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
378
    if (BIT_WIDTH == 0) return 0;
178
179
378
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
378
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
378
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
378
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
378
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
378
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
378
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
378
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
378
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
378
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
378
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
378
    constexpr bool READ_32_BITS =
203
378
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
378
    if (READ_32_BITS) {
206
378
        uint32_t word = in[FIRST_WORD_IDX];
207
378
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
378
        return word & mask;
209
378
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
378
}
_ZN5doris11UnpackValueILi4ELi7ELb1EEEmPKh
Line
Count
Source
176
378
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
378
    if (BIT_WIDTH == 0) return 0;
178
179
378
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
378
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
378
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
378
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
378
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
378
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
378
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
378
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
378
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
378
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
378
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
378
    constexpr bool READ_32_BITS =
203
378
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
378
    if (READ_32_BITS) {
206
378
        uint32_t word = in[FIRST_WORD_IDX];
207
378
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
378
        return word & mask;
209
378
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
378
}
_ZN5doris11UnpackValueILi4ELi8ELb1EEEmPKh
Line
Count
Source
176
378
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
378
    if (BIT_WIDTH == 0) return 0;
178
179
378
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
378
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
378
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
378
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
378
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
378
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
378
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
378
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
378
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
378
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
378
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
378
    constexpr bool READ_32_BITS =
203
378
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
378
    if (READ_32_BITS) {
206
378
        uint32_t word = in[FIRST_WORD_IDX];
207
378
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
378
        return word & mask;
209
378
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
378
}
_ZN5doris11UnpackValueILi4ELi9ELb1EEEmPKh
Line
Count
Source
176
378
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
378
    if (BIT_WIDTH == 0) return 0;
178
179
378
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
378
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
378
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
378
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
378
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
378
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
378
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
378
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
378
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
378
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
378
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
378
    constexpr bool READ_32_BITS =
203
378
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
378
    if (READ_32_BITS) {
206
378
        uint32_t word = in[FIRST_WORD_IDX];
207
378
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
378
        return word & mask;
209
378
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
378
}
_ZN5doris11UnpackValueILi4ELi10ELb1EEEmPKh
Line
Count
Source
176
378
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
378
    if (BIT_WIDTH == 0) return 0;
178
179
378
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
378
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
378
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
378
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
378
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
378
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
378
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
378
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
378
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
378
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
378
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
378
    constexpr bool READ_32_BITS =
203
378
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
378
    if (READ_32_BITS) {
206
378
        uint32_t word = in[FIRST_WORD_IDX];
207
378
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
378
        return word & mask;
209
378
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
378
}
_ZN5doris11UnpackValueILi4ELi11ELb1EEEmPKh
Line
Count
Source
176
378
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
378
    if (BIT_WIDTH == 0) return 0;
178
179
378
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
378
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
378
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
378
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
378
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
378
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
378
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
378
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
378
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
378
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
378
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
378
    constexpr bool READ_32_BITS =
203
378
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
378
    if (READ_32_BITS) {
206
378
        uint32_t word = in[FIRST_WORD_IDX];
207
378
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
378
        return word & mask;
209
378
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
378
}
_ZN5doris11UnpackValueILi4ELi12ELb1EEEmPKh
Line
Count
Source
176
378
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
378
    if (BIT_WIDTH == 0) return 0;
178
179
378
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
378
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
378
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
378
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
378
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
378
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
378
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
378
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
378
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
378
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
378
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
378
    constexpr bool READ_32_BITS =
203
378
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
378
    if (READ_32_BITS) {
206
378
        uint32_t word = in[FIRST_WORD_IDX];
207
378
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
378
        return word & mask;
209
378
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
378
}
_ZN5doris11UnpackValueILi4ELi13ELb1EEEmPKh
Line
Count
Source
176
378
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
378
    if (BIT_WIDTH == 0) return 0;
178
179
378
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
378
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
378
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
378
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
378
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
378
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
378
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
378
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
378
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
378
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
378
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
378
    constexpr bool READ_32_BITS =
203
378
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
378
    if (READ_32_BITS) {
206
378
        uint32_t word = in[FIRST_WORD_IDX];
207
378
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
378
        return word & mask;
209
378
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
378
}
_ZN5doris11UnpackValueILi4ELi14ELb1EEEmPKh
Line
Count
Source
176
378
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
378
    if (BIT_WIDTH == 0) return 0;
178
179
378
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
378
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
378
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
378
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
378
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
378
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
378
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
378
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
378
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
378
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
378
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
378
    constexpr bool READ_32_BITS =
203
378
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
378
    if (READ_32_BITS) {
206
378
        uint32_t word = in[FIRST_WORD_IDX];
207
378
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
378
        return word & mask;
209
378
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
378
}
_ZN5doris11UnpackValueILi4ELi15ELb1EEEmPKh
Line
Count
Source
176
378
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
378
    if (BIT_WIDTH == 0) return 0;
178
179
378
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
378
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
378
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
378
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
378
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
378
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
378
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
378
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
378
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
378
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
378
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
378
    constexpr bool READ_32_BITS =
203
378
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
378
    if (READ_32_BITS) {
206
378
        uint32_t word = in[FIRST_WORD_IDX];
207
378
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
378
        return word & mask;
209
378
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
378
}
_ZN5doris11UnpackValueILi4ELi16ELb1EEEmPKh
Line
Count
Source
176
378
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
378
    if (BIT_WIDTH == 0) return 0;
178
179
378
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
378
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
378
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
378
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
378
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
378
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
378
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
378
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
378
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
378
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
378
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
378
    constexpr bool READ_32_BITS =
203
378
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
378
    if (READ_32_BITS) {
206
378
        uint32_t word = in[FIRST_WORD_IDX];
207
378
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
378
        return word & mask;
209
378
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
378
}
_ZN5doris11UnpackValueILi4ELi17ELb1EEEmPKh
Line
Count
Source
176
378
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
378
    if (BIT_WIDTH == 0) return 0;
178
179
378
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
378
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
378
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
378
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
378
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
378
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
378
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
378
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
378
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
378
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
378
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
378
    constexpr bool READ_32_BITS =
203
378
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
378
    if (READ_32_BITS) {
206
378
        uint32_t word = in[FIRST_WORD_IDX];
207
378
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
378
        return word & mask;
209
378
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
378
}
_ZN5doris11UnpackValueILi4ELi18ELb1EEEmPKh
Line
Count
Source
176
378
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
378
    if (BIT_WIDTH == 0) return 0;
178
179
378
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
378
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
378
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
378
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
378
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
378
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
378
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
378
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
378
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
378
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
378
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
378
    constexpr bool READ_32_BITS =
203
378
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
378
    if (READ_32_BITS) {
206
378
        uint32_t word = in[FIRST_WORD_IDX];
207
378
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
378
        return word & mask;
209
378
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
378
}
_ZN5doris11UnpackValueILi4ELi19ELb1EEEmPKh
Line
Count
Source
176
378
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
378
    if (BIT_WIDTH == 0) return 0;
178
179
378
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
378
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
378
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
378
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
378
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
378
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
378
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
378
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
378
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
378
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
378
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
378
    constexpr bool READ_32_BITS =
203
378
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
378
    if (READ_32_BITS) {
206
378
        uint32_t word = in[FIRST_WORD_IDX];
207
378
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
378
        return word & mask;
209
378
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
378
}
_ZN5doris11UnpackValueILi4ELi20ELb1EEEmPKh
Line
Count
Source
176
378
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
378
    if (BIT_WIDTH == 0) return 0;
178
179
378
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
378
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
378
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
378
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
378
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
378
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
378
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
378
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
378
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
378
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
378
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
378
    constexpr bool READ_32_BITS =
203
378
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
378
    if (READ_32_BITS) {
206
378
        uint32_t word = in[FIRST_WORD_IDX];
207
378
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
378
        return word & mask;
209
378
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
378
}
_ZN5doris11UnpackValueILi4ELi21ELb1EEEmPKh
Line
Count
Source
176
378
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
378
    if (BIT_WIDTH == 0) return 0;
178
179
378
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
378
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
378
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
378
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
378
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
378
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
378
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
378
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
378
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
378
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
378
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
378
    constexpr bool READ_32_BITS =
203
378
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
378
    if (READ_32_BITS) {
206
378
        uint32_t word = in[FIRST_WORD_IDX];
207
378
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
378
        return word & mask;
209
378
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
378
}
_ZN5doris11UnpackValueILi4ELi22ELb1EEEmPKh
Line
Count
Source
176
378
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
378
    if (BIT_WIDTH == 0) return 0;
178
179
378
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
378
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
378
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
378
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
378
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
378
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
378
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
378
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
378
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
378
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
378
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
378
    constexpr bool READ_32_BITS =
203
378
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
378
    if (READ_32_BITS) {
206
378
        uint32_t word = in[FIRST_WORD_IDX];
207
378
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
378
        return word & mask;
209
378
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
378
}
_ZN5doris11UnpackValueILi4ELi23ELb1EEEmPKh
Line
Count
Source
176
378
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
378
    if (BIT_WIDTH == 0) return 0;
178
179
378
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
378
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
378
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
378
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
378
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
378
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
378
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
378
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
378
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
378
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
378
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
378
    constexpr bool READ_32_BITS =
203
378
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
378
    if (READ_32_BITS) {
206
378
        uint32_t word = in[FIRST_WORD_IDX];
207
378
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
378
        return word & mask;
209
378
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
378
}
_ZN5doris11UnpackValueILi4ELi24ELb1EEEmPKh
Line
Count
Source
176
378
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
378
    if (BIT_WIDTH == 0) return 0;
178
179
378
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
378
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
378
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
378
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
378
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
378
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
378
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
378
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
378
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
378
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
378
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
378
    constexpr bool READ_32_BITS =
203
378
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
378
    if (READ_32_BITS) {
206
378
        uint32_t word = in[FIRST_WORD_IDX];
207
378
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
378
        return word & mask;
209
378
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
378
}
_ZN5doris11UnpackValueILi4ELi25ELb1EEEmPKh
Line
Count
Source
176
378
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
378
    if (BIT_WIDTH == 0) return 0;
178
179
378
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
378
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
378
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
378
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
378
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
378
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
378
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
378
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
378
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
378
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
378
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
378
    constexpr bool READ_32_BITS =
203
378
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
378
    if (READ_32_BITS) {
206
378
        uint32_t word = in[FIRST_WORD_IDX];
207
378
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
378
        return word & mask;
209
378
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
378
}
_ZN5doris11UnpackValueILi4ELi26ELb1EEEmPKh
Line
Count
Source
176
378
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
378
    if (BIT_WIDTH == 0) return 0;
178
179
378
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
378
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
378
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
378
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
378
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
378
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
378
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
378
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
378
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
378
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
378
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
378
    constexpr bool READ_32_BITS =
203
378
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
378
    if (READ_32_BITS) {
206
378
        uint32_t word = in[FIRST_WORD_IDX];
207
378
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
378
        return word & mask;
209
378
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
378
}
_ZN5doris11UnpackValueILi4ELi27ELb1EEEmPKh
Line
Count
Source
176
378
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
378
    if (BIT_WIDTH == 0) return 0;
178
179
378
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
378
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
378
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
378
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
378
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
378
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
378
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
378
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
378
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
378
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
378
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
378
    constexpr bool READ_32_BITS =
203
378
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
378
    if (READ_32_BITS) {
206
378
        uint32_t word = in[FIRST_WORD_IDX];
207
378
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
378
        return word & mask;
209
378
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
378
}
_ZN5doris11UnpackValueILi4ELi28ELb1EEEmPKh
Line
Count
Source
176
378
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
378
    if (BIT_WIDTH == 0) return 0;
178
179
378
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
378
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
378
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
378
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
378
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
378
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
378
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
378
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
378
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
378
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
378
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
378
    constexpr bool READ_32_BITS =
203
378
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
378
    if (READ_32_BITS) {
206
378
        uint32_t word = in[FIRST_WORD_IDX];
207
378
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
378
        return word & mask;
209
378
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
378
}
_ZN5doris11UnpackValueILi4ELi29ELb1EEEmPKh
Line
Count
Source
176
378
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
378
    if (BIT_WIDTH == 0) return 0;
178
179
378
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
378
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
378
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
378
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
378
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
378
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
378
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
378
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
378
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
378
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
378
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
378
    constexpr bool READ_32_BITS =
203
378
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
378
    if (READ_32_BITS) {
206
378
        uint32_t word = in[FIRST_WORD_IDX];
207
378
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
378
        return word & mask;
209
378
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
378
}
_ZN5doris11UnpackValueILi4ELi30ELb1EEEmPKh
Line
Count
Source
176
378
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
378
    if (BIT_WIDTH == 0) return 0;
178
179
378
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
378
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
378
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
378
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
378
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
378
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
378
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
378
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
378
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
378
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
378
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
378
    constexpr bool READ_32_BITS =
203
378
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
378
    if (READ_32_BITS) {
206
378
        uint32_t word = in[FIRST_WORD_IDX];
207
378
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
378
        return word & mask;
209
378
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
378
}
_ZN5doris11UnpackValueILi4ELi31ELb1EEEmPKh
Line
Count
Source
176
378
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
378
    if (BIT_WIDTH == 0) return 0;
178
179
378
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
378
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
378
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
378
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
378
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
378
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
378
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
378
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
378
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
378
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
378
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
378
    constexpr bool READ_32_BITS =
203
378
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
378
    if (READ_32_BITS) {
206
378
        uint32_t word = in[FIRST_WORD_IDX];
207
378
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
378
        return word & mask;
209
378
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
378
}
Unexecuted instantiation: _ZN5doris11UnpackValueILi4ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi4ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi4ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi4ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi4ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi4ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi4ELi24ELb0EEEmPKh
_ZN5doris11UnpackValueILi4ELi23ELb0EEEmPKh
Line
Count
Source
176
40
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
40
    if (BIT_WIDTH == 0) return 0;
178
179
40
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
40
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
40
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
40
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
40
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
40
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
40
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
40
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
40
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
40
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
40
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
40
    constexpr bool READ_32_BITS =
203
40
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
40
    if (READ_32_BITS) {
206
40
        uint32_t word = in[FIRST_WORD_IDX];
207
40
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
40
        return word & mask;
209
40
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
40
}
_ZN5doris11UnpackValueILi4ELi22ELb0EEEmPKh
Line
Count
Source
176
40
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
40
    if (BIT_WIDTH == 0) return 0;
178
179
40
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
40
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
40
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
40
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
40
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
40
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
40
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
40
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
40
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
40
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
40
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
40
    constexpr bool READ_32_BITS =
203
40
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
40
    if (READ_32_BITS) {
206
40
        uint32_t word = in[FIRST_WORD_IDX];
207
40
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
40
        return word & mask;
209
40
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
40
}
_ZN5doris11UnpackValueILi4ELi21ELb0EEEmPKh
Line
Count
Source
176
40
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
40
    if (BIT_WIDTH == 0) return 0;
178
179
40
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
40
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
40
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
40
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
40
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
40
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
40
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
40
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
40
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
40
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
40
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
40
    constexpr bool READ_32_BITS =
203
40
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
40
    if (READ_32_BITS) {
206
40
        uint32_t word = in[FIRST_WORD_IDX];
207
40
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
40
        return word & mask;
209
40
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
40
}
_ZN5doris11UnpackValueILi4ELi20ELb0EEEmPKh
Line
Count
Source
176
40
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
40
    if (BIT_WIDTH == 0) return 0;
178
179
40
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
40
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
40
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
40
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
40
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
40
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
40
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
40
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
40
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
40
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
40
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
40
    constexpr bool READ_32_BITS =
203
40
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
40
    if (READ_32_BITS) {
206
40
        uint32_t word = in[FIRST_WORD_IDX];
207
40
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
40
        return word & mask;
209
40
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
40
}
_ZN5doris11UnpackValueILi4ELi19ELb0EEEmPKh
Line
Count
Source
176
40
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
40
    if (BIT_WIDTH == 0) return 0;
178
179
40
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
40
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
40
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
40
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
40
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
40
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
40
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
40
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
40
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
40
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
40
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
40
    constexpr bool READ_32_BITS =
203
40
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
40
    if (READ_32_BITS) {
206
40
        uint32_t word = in[FIRST_WORD_IDX];
207
40
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
40
        return word & mask;
209
40
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
40
}
_ZN5doris11UnpackValueILi4ELi18ELb0EEEmPKh
Line
Count
Source
176
40
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
40
    if (BIT_WIDTH == 0) return 0;
178
179
40
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
40
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
40
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
40
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
40
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
40
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
40
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
40
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
40
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
40
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
40
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
40
    constexpr bool READ_32_BITS =
203
40
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
40
    if (READ_32_BITS) {
206
40
        uint32_t word = in[FIRST_WORD_IDX];
207
40
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
40
        return word & mask;
209
40
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
40
}
_ZN5doris11UnpackValueILi4ELi17ELb0EEEmPKh
Line
Count
Source
176
40
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
40
    if (BIT_WIDTH == 0) return 0;
178
179
40
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
40
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
40
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
40
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
40
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
40
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
40
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
40
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
40
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
40
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
40
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
40
    constexpr bool READ_32_BITS =
203
40
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
40
    if (READ_32_BITS) {
206
40
        uint32_t word = in[FIRST_WORD_IDX];
207
40
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
40
        return word & mask;
209
40
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
40
}
_ZN5doris11UnpackValueILi4ELi16ELb0EEEmPKh
Line
Count
Source
176
40
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
40
    if (BIT_WIDTH == 0) return 0;
178
179
40
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
40
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
40
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
40
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
40
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
40
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
40
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
40
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
40
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
40
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
40
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
40
    constexpr bool READ_32_BITS =
203
40
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
40
    if (READ_32_BITS) {
206
40
        uint32_t word = in[FIRST_WORD_IDX];
207
40
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
40
        return word & mask;
209
40
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
40
}
_ZN5doris11UnpackValueILi4ELi15ELb0EEEmPKh
Line
Count
Source
176
175
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
175
    if (BIT_WIDTH == 0) return 0;
178
179
175
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
175
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
175
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
175
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
175
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
175
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
175
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
175
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
175
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
175
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
175
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
175
    constexpr bool READ_32_BITS =
203
175
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
175
    if (READ_32_BITS) {
206
175
        uint32_t word = in[FIRST_WORD_IDX];
207
175
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
175
        return word & mask;
209
175
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
175
}
_ZN5doris11UnpackValueILi4ELi14ELb0EEEmPKh
Line
Count
Source
176
175
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
175
    if (BIT_WIDTH == 0) return 0;
178
179
175
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
175
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
175
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
175
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
175
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
175
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
175
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
175
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
175
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
175
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
175
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
175
    constexpr bool READ_32_BITS =
203
175
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
175
    if (READ_32_BITS) {
206
175
        uint32_t word = in[FIRST_WORD_IDX];
207
175
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
175
        return word & mask;
209
175
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
175
}
_ZN5doris11UnpackValueILi4ELi13ELb0EEEmPKh
Line
Count
Source
176
175
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
175
    if (BIT_WIDTH == 0) return 0;
178
179
175
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
175
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
175
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
175
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
175
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
175
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
175
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
175
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
175
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
175
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
175
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
175
    constexpr bool READ_32_BITS =
203
175
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
175
    if (READ_32_BITS) {
206
175
        uint32_t word = in[FIRST_WORD_IDX];
207
175
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
175
        return word & mask;
209
175
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
175
}
_ZN5doris11UnpackValueILi4ELi12ELb0EEEmPKh
Line
Count
Source
176
175
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
175
    if (BIT_WIDTH == 0) return 0;
178
179
175
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
175
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
175
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
175
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
175
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
175
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
175
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
175
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
175
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
175
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
175
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
175
    constexpr bool READ_32_BITS =
203
175
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
175
    if (READ_32_BITS) {
206
175
        uint32_t word = in[FIRST_WORD_IDX];
207
175
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
175
        return word & mask;
209
175
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
175
}
_ZN5doris11UnpackValueILi4ELi11ELb0EEEmPKh
Line
Count
Source
176
175
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
175
    if (BIT_WIDTH == 0) return 0;
178
179
175
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
175
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
175
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
175
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
175
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
175
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
175
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
175
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
175
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
175
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
175
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
175
    constexpr bool READ_32_BITS =
203
175
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
175
    if (READ_32_BITS) {
206
175
        uint32_t word = in[FIRST_WORD_IDX];
207
175
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
175
        return word & mask;
209
175
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
175
}
_ZN5doris11UnpackValueILi4ELi10ELb0EEEmPKh
Line
Count
Source
176
175
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
175
    if (BIT_WIDTH == 0) return 0;
178
179
175
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
175
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
175
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
175
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
175
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
175
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
175
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
175
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
175
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
175
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
175
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
175
    constexpr bool READ_32_BITS =
203
175
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
175
    if (READ_32_BITS) {
206
175
        uint32_t word = in[FIRST_WORD_IDX];
207
175
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
175
        return word & mask;
209
175
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
175
}
_ZN5doris11UnpackValueILi4ELi9ELb0EEEmPKh
Line
Count
Source
176
175
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
175
    if (BIT_WIDTH == 0) return 0;
178
179
175
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
175
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
175
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
175
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
175
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
175
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
175
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
175
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
175
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
175
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
175
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
175
    constexpr bool READ_32_BITS =
203
175
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
175
    if (READ_32_BITS) {
206
175
        uint32_t word = in[FIRST_WORD_IDX];
207
175
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
175
        return word & mask;
209
175
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
175
}
_ZN5doris11UnpackValueILi4ELi8ELb0EEEmPKh
Line
Count
Source
176
175
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
175
    if (BIT_WIDTH == 0) return 0;
178
179
175
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
175
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
175
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
175
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
175
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
175
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
175
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
175
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
175
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
175
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
175
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
175
    constexpr bool READ_32_BITS =
203
175
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
175
    if (READ_32_BITS) {
206
175
        uint32_t word = in[FIRST_WORD_IDX];
207
175
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
175
        return word & mask;
209
175
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
175
}
_ZN5doris11UnpackValueILi4ELi7ELb0EEEmPKh
Line
Count
Source
176
194
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
194
    if (BIT_WIDTH == 0) return 0;
178
179
194
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
194
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
194
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
194
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
194
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
194
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
194
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
194
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
194
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
194
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
194
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
194
    constexpr bool READ_32_BITS =
203
194
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
194
    if (READ_32_BITS) {
206
194
        uint32_t word = in[FIRST_WORD_IDX];
207
194
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
194
        return word & mask;
209
194
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
194
}
_ZN5doris11UnpackValueILi4ELi6ELb0EEEmPKh
Line
Count
Source
176
194
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
194
    if (BIT_WIDTH == 0) return 0;
178
179
194
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
194
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
194
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
194
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
194
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
194
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
194
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
194
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
194
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
194
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
194
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
194
    constexpr bool READ_32_BITS =
203
194
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
194
    if (READ_32_BITS) {
206
194
        uint32_t word = in[FIRST_WORD_IDX];
207
194
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
194
        return word & mask;
209
194
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
194
}
_ZN5doris11UnpackValueILi4ELi5ELb0EEEmPKh
Line
Count
Source
176
194
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
194
    if (BIT_WIDTH == 0) return 0;
178
179
194
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
194
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
194
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
194
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
194
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
194
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
194
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
194
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
194
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
194
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
194
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
194
    constexpr bool READ_32_BITS =
203
194
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
194
    if (READ_32_BITS) {
206
194
        uint32_t word = in[FIRST_WORD_IDX];
207
194
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
194
        return word & mask;
209
194
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
194
}
_ZN5doris11UnpackValueILi4ELi4ELb0EEEmPKh
Line
Count
Source
176
194
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
194
    if (BIT_WIDTH == 0) return 0;
178
179
194
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
194
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
194
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
194
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
194
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
194
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
194
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
194
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
194
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
194
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
194
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
194
    constexpr bool READ_32_BITS =
203
194
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
194
    if (READ_32_BITS) {
206
194
        uint32_t word = in[FIRST_WORD_IDX];
207
194
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
194
        return word & mask;
209
194
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
194
}
_ZN5doris11UnpackValueILi4ELi3ELb0EEEmPKh
Line
Count
Source
176
194
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
194
    if (BIT_WIDTH == 0) return 0;
178
179
194
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
194
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
194
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
194
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
194
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
194
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
194
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
194
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
194
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
194
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
194
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
194
    constexpr bool READ_32_BITS =
203
194
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
194
    if (READ_32_BITS) {
206
194
        uint32_t word = in[FIRST_WORD_IDX];
207
194
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
194
        return word & mask;
209
194
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
194
}
_ZN5doris11UnpackValueILi4ELi2ELb0EEEmPKh
Line
Count
Source
176
194
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
194
    if (BIT_WIDTH == 0) return 0;
178
179
194
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
194
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
194
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
194
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
194
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
194
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
194
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
194
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
194
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
194
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
194
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
194
    constexpr bool READ_32_BITS =
203
194
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
194
    if (READ_32_BITS) {
206
194
        uint32_t word = in[FIRST_WORD_IDX];
207
194
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
194
        return word & mask;
209
194
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
194
}
_ZN5doris11UnpackValueILi4ELi1ELb0EEEmPKh
Line
Count
Source
176
194
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
194
    if (BIT_WIDTH == 0) return 0;
178
179
194
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
194
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
194
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
194
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
194
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
194
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
194
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
194
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
194
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
194
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
194
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
194
    constexpr bool READ_32_BITS =
203
194
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
194
    if (READ_32_BITS) {
206
194
        uint32_t word = in[FIRST_WORD_IDX];
207
194
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
194
        return word & mask;
209
194
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
194
}
_ZN5doris11UnpackValueILi4ELi0ELb0EEEmPKh
Line
Count
Source
176
194
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
194
    if (BIT_WIDTH == 0) return 0;
178
179
194
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
194
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
194
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
194
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
194
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
194
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
194
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
194
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
194
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
194
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
194
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
194
    constexpr bool READ_32_BITS =
203
194
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
194
    if (READ_32_BITS) {
206
194
        uint32_t word = in[FIRST_WORD_IDX];
207
194
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
194
        return word & mask;
209
194
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
194
}
_ZN5doris11UnpackValueILi5ELi0ELb1EEEmPKh
Line
Count
Source
176
3.01k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
3.01k
    if (BIT_WIDTH == 0) return 0;
178
179
3.01k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
3.01k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
3.01k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
3.01k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
3.01k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
3.01k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
3.01k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
3.01k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
3.01k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
3.01k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
3.01k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
3.01k
    constexpr bool READ_32_BITS =
203
3.01k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
3.01k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
3.01k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
3.01k
    word >>= FIRST_BIT_OFFSET;
213
214
3.01k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
3.01k
    return word & mask;
221
3.01k
}
_ZN5doris11UnpackValueILi5ELi1ELb1EEEmPKh
Line
Count
Source
176
3.01k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
3.01k
    if (BIT_WIDTH == 0) return 0;
178
179
3.01k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
3.01k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
3.01k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
3.01k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
3.01k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
3.01k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
3.01k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
3.01k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
3.01k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
3.01k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
3.01k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
3.01k
    constexpr bool READ_32_BITS =
203
3.01k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
3.01k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
3.01k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
3.01k
    word >>= FIRST_BIT_OFFSET;
213
214
3.01k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
3.01k
    return word & mask;
221
3.01k
}
_ZN5doris11UnpackValueILi5ELi2ELb1EEEmPKh
Line
Count
Source
176
3.01k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
3.01k
    if (BIT_WIDTH == 0) return 0;
178
179
3.01k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
3.01k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
3.01k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
3.01k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
3.01k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
3.01k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
3.01k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
3.01k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
3.01k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
3.01k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
3.01k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
3.01k
    constexpr bool READ_32_BITS =
203
3.01k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
3.01k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
3.01k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
3.01k
    word >>= FIRST_BIT_OFFSET;
213
214
3.01k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
3.01k
    return word & mask;
221
3.01k
}
_ZN5doris11UnpackValueILi5ELi3ELb1EEEmPKh
Line
Count
Source
176
3.01k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
3.01k
    if (BIT_WIDTH == 0) return 0;
178
179
3.01k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
3.01k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
3.01k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
3.01k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
3.01k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
3.01k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
3.01k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
3.01k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
3.01k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
3.01k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
3.01k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
3.01k
    constexpr bool READ_32_BITS =
203
3.01k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
3.01k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
3.01k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
3.01k
    word >>= FIRST_BIT_OFFSET;
213
214
3.01k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
3.01k
    return word & mask;
221
3.01k
}
_ZN5doris11UnpackValueILi5ELi4ELb1EEEmPKh
Line
Count
Source
176
3.01k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
3.01k
    if (BIT_WIDTH == 0) return 0;
178
179
3.01k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
3.01k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
3.01k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
3.01k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
3.01k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
3.01k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
3.01k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
3.01k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
3.01k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
3.01k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
3.01k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
3.01k
    constexpr bool READ_32_BITS =
203
3.01k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
3.01k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
3.01k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
3.01k
    word >>= FIRST_BIT_OFFSET;
213
214
3.01k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
3.01k
    return word & mask;
221
3.01k
}
_ZN5doris11UnpackValueILi5ELi5ELb1EEEmPKh
Line
Count
Source
176
3.01k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
3.01k
    if (BIT_WIDTH == 0) return 0;
178
179
3.01k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
3.01k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
3.01k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
3.01k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
3.01k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
3.01k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
3.01k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
3.01k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
3.01k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
3.01k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
3.01k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
3.01k
    constexpr bool READ_32_BITS =
203
3.01k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
3.01k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
3.01k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
3.01k
    word >>= FIRST_BIT_OFFSET;
213
214
3.01k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
3.01k
    return word & mask;
221
3.01k
}
_ZN5doris11UnpackValueILi5ELi6ELb1EEEmPKh
Line
Count
Source
176
3.01k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
3.01k
    if (BIT_WIDTH == 0) return 0;
178
179
3.01k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
3.01k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
3.01k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
3.01k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
3.01k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
3.01k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
3.01k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
3.01k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
3.01k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
3.01k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
3.01k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
3.01k
    constexpr bool READ_32_BITS =
203
3.01k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
3.01k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
3.01k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
3.01k
    word >>= FIRST_BIT_OFFSET;
213
214
3.01k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
3.01k
    return word & mask;
221
3.01k
}
_ZN5doris11UnpackValueILi5ELi7ELb1EEEmPKh
Line
Count
Source
176
3.01k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
3.01k
    if (BIT_WIDTH == 0) return 0;
178
179
3.01k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
3.01k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
3.01k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
3.01k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
3.01k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
3.01k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
3.01k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
3.01k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
3.01k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
3.01k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
3.01k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
3.01k
    constexpr bool READ_32_BITS =
203
3.01k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
3.01k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
3.01k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
3.01k
    word >>= FIRST_BIT_OFFSET;
213
214
3.01k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
3.01k
    return word & mask;
221
3.01k
}
_ZN5doris11UnpackValueILi5ELi8ELb1EEEmPKh
Line
Count
Source
176
3.01k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
3.01k
    if (BIT_WIDTH == 0) return 0;
178
179
3.01k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
3.01k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
3.01k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
3.01k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
3.01k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
3.01k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
3.01k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
3.01k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
3.01k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
3.01k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
3.01k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
3.01k
    constexpr bool READ_32_BITS =
203
3.01k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
3.01k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
3.01k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
3.01k
    word >>= FIRST_BIT_OFFSET;
213
214
3.01k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
3.01k
    return word & mask;
221
3.01k
}
_ZN5doris11UnpackValueILi5ELi9ELb1EEEmPKh
Line
Count
Source
176
3.01k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
3.01k
    if (BIT_WIDTH == 0) return 0;
178
179
3.01k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
3.01k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
3.01k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
3.01k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
3.01k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
3.01k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
3.01k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
3.01k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
3.01k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
3.01k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
3.01k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
3.01k
    constexpr bool READ_32_BITS =
203
3.01k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
3.01k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
3.01k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
3.01k
    word >>= FIRST_BIT_OFFSET;
213
214
3.01k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
3.01k
    return word & mask;
221
3.01k
}
_ZN5doris11UnpackValueILi5ELi10ELb1EEEmPKh
Line
Count
Source
176
3.01k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
3.01k
    if (BIT_WIDTH == 0) return 0;
178
179
3.01k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
3.01k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
3.01k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
3.01k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
3.01k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
3.01k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
3.01k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
3.01k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
3.01k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
3.01k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
3.01k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
3.01k
    constexpr bool READ_32_BITS =
203
3.01k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
3.01k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
3.01k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
3.01k
    word >>= FIRST_BIT_OFFSET;
213
214
3.01k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
3.01k
    return word & mask;
221
3.01k
}
_ZN5doris11UnpackValueILi5ELi11ELb1EEEmPKh
Line
Count
Source
176
3.01k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
3.01k
    if (BIT_WIDTH == 0) return 0;
178
179
3.01k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
3.01k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
3.01k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
3.01k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
3.01k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
3.01k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
3.01k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
3.01k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
3.01k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
3.01k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
3.01k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
3.01k
    constexpr bool READ_32_BITS =
203
3.01k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
3.01k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
3.01k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
3.01k
    word >>= FIRST_BIT_OFFSET;
213
214
3.01k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
3.01k
    return word & mask;
221
3.01k
}
_ZN5doris11UnpackValueILi5ELi12ELb1EEEmPKh
Line
Count
Source
176
3.01k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
3.01k
    if (BIT_WIDTH == 0) return 0;
178
179
3.01k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
3.01k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
3.01k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
3.01k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
3.01k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
3.01k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
3.01k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
3.01k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
3.01k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
3.01k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
3.01k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
3.01k
    constexpr bool READ_32_BITS =
203
3.01k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
3.01k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
3.01k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
3.01k
    word >>= FIRST_BIT_OFFSET;
213
214
3.01k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
3.01k
    return word & mask;
221
3.01k
}
_ZN5doris11UnpackValueILi5ELi13ELb1EEEmPKh
Line
Count
Source
176
3.01k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
3.01k
    if (BIT_WIDTH == 0) return 0;
178
179
3.01k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
3.01k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
3.01k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
3.01k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
3.01k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
3.01k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
3.01k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
3.01k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
3.01k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
3.01k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
3.01k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
3.01k
    constexpr bool READ_32_BITS =
203
3.01k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
3.01k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
3.01k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
3.01k
    word >>= FIRST_BIT_OFFSET;
213
214
3.01k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
3.01k
    return word & mask;
221
3.01k
}
_ZN5doris11UnpackValueILi5ELi14ELb1EEEmPKh
Line
Count
Source
176
3.01k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
3.01k
    if (BIT_WIDTH == 0) return 0;
178
179
3.01k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
3.01k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
3.01k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
3.01k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
3.01k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
3.01k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
3.01k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
3.01k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
3.01k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
3.01k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
3.01k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
3.01k
    constexpr bool READ_32_BITS =
203
3.01k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
3.01k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
3.01k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
3.01k
    word >>= FIRST_BIT_OFFSET;
213
214
3.01k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
3.01k
    return word & mask;
221
3.01k
}
_ZN5doris11UnpackValueILi5ELi15ELb1EEEmPKh
Line
Count
Source
176
3.01k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
3.01k
    if (BIT_WIDTH == 0) return 0;
178
179
3.01k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
3.01k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
3.01k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
3.01k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
3.01k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
3.01k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
3.01k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
3.01k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
3.01k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
3.01k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
3.01k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
3.01k
    constexpr bool READ_32_BITS =
203
3.01k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
3.01k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
3.01k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
3.01k
    word >>= FIRST_BIT_OFFSET;
213
214
3.01k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
3.01k
    return word & mask;
221
3.01k
}
_ZN5doris11UnpackValueILi5ELi16ELb1EEEmPKh
Line
Count
Source
176
3.01k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
3.01k
    if (BIT_WIDTH == 0) return 0;
178
179
3.01k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
3.01k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
3.01k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
3.01k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
3.01k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
3.01k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
3.01k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
3.01k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
3.01k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
3.01k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
3.01k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
3.01k
    constexpr bool READ_32_BITS =
203
3.01k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
3.01k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
3.01k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
3.01k
    word >>= FIRST_BIT_OFFSET;
213
214
3.01k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
3.01k
    return word & mask;
221
3.01k
}
_ZN5doris11UnpackValueILi5ELi17ELb1EEEmPKh
Line
Count
Source
176
3.01k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
3.01k
    if (BIT_WIDTH == 0) return 0;
178
179
3.01k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
3.01k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
3.01k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
3.01k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
3.01k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
3.01k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
3.01k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
3.01k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
3.01k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
3.01k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
3.01k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
3.01k
    constexpr bool READ_32_BITS =
203
3.01k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
3.01k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
3.01k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
3.01k
    word >>= FIRST_BIT_OFFSET;
213
214
3.01k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
3.01k
    return word & mask;
221
3.01k
}
_ZN5doris11UnpackValueILi5ELi18ELb1EEEmPKh
Line
Count
Source
176
3.01k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
3.01k
    if (BIT_WIDTH == 0) return 0;
178
179
3.01k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
3.01k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
3.01k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
3.01k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
3.01k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
3.01k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
3.01k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
3.01k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
3.01k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
3.01k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
3.01k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
3.01k
    constexpr bool READ_32_BITS =
203
3.01k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
3.01k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
3.01k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
3.01k
    word >>= FIRST_BIT_OFFSET;
213
214
3.01k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
3.01k
    return word & mask;
221
3.01k
}
_ZN5doris11UnpackValueILi5ELi19ELb1EEEmPKh
Line
Count
Source
176
3.01k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
3.01k
    if (BIT_WIDTH == 0) return 0;
178
179
3.01k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
3.01k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
3.01k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
3.01k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
3.01k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
3.01k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
3.01k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
3.01k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
3.01k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
3.01k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
3.01k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
3.01k
    constexpr bool READ_32_BITS =
203
3.01k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
3.01k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
3.01k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
3.01k
    word >>= FIRST_BIT_OFFSET;
213
214
3.01k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
3.01k
    return word & mask;
221
3.01k
}
_ZN5doris11UnpackValueILi5ELi20ELb1EEEmPKh
Line
Count
Source
176
3.01k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
3.01k
    if (BIT_WIDTH == 0) return 0;
178
179
3.01k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
3.01k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
3.01k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
3.01k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
3.01k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
3.01k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
3.01k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
3.01k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
3.01k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
3.01k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
3.01k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
3.01k
    constexpr bool READ_32_BITS =
203
3.01k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
3.01k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
3.01k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
3.01k
    word >>= FIRST_BIT_OFFSET;
213
214
3.01k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
3.01k
    return word & mask;
221
3.01k
}
_ZN5doris11UnpackValueILi5ELi21ELb1EEEmPKh
Line
Count
Source
176
3.01k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
3.01k
    if (BIT_WIDTH == 0) return 0;
178
179
3.01k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
3.01k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
3.01k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
3.01k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
3.01k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
3.01k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
3.01k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
3.01k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
3.01k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
3.01k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
3.01k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
3.01k
    constexpr bool READ_32_BITS =
203
3.01k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
3.01k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
3.01k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
3.01k
    word >>= FIRST_BIT_OFFSET;
213
214
3.01k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
3.01k
    return word & mask;
221
3.01k
}
_ZN5doris11UnpackValueILi5ELi22ELb1EEEmPKh
Line
Count
Source
176
3.01k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
3.01k
    if (BIT_WIDTH == 0) return 0;
178
179
3.01k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
3.01k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
3.01k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
3.01k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
3.01k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
3.01k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
3.01k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
3.01k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
3.01k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
3.01k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
3.01k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
3.01k
    constexpr bool READ_32_BITS =
203
3.01k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
3.01k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
3.01k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
3.01k
    word >>= FIRST_BIT_OFFSET;
213
214
3.01k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
3.01k
    return word & mask;
221
3.01k
}
_ZN5doris11UnpackValueILi5ELi23ELb1EEEmPKh
Line
Count
Source
176
3.01k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
3.01k
    if (BIT_WIDTH == 0) return 0;
178
179
3.01k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
3.01k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
3.01k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
3.01k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
3.01k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
3.01k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
3.01k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
3.01k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
3.01k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
3.01k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
3.01k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
3.01k
    constexpr bool READ_32_BITS =
203
3.01k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
3.01k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
3.01k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
3.01k
    word >>= FIRST_BIT_OFFSET;
213
214
3.01k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
3.01k
    return word & mask;
221
3.01k
}
_ZN5doris11UnpackValueILi5ELi24ELb1EEEmPKh
Line
Count
Source
176
3.01k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
3.01k
    if (BIT_WIDTH == 0) return 0;
178
179
3.01k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
3.01k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
3.01k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
3.01k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
3.01k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
3.01k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
3.01k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
3.01k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
3.01k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
3.01k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
3.01k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
3.01k
    constexpr bool READ_32_BITS =
203
3.01k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
3.01k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
3.01k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
3.01k
    word >>= FIRST_BIT_OFFSET;
213
214
3.01k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
3.01k
    return word & mask;
221
3.01k
}
_ZN5doris11UnpackValueILi5ELi25ELb1EEEmPKh
Line
Count
Source
176
3.01k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
3.01k
    if (BIT_WIDTH == 0) return 0;
178
179
3.01k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
3.01k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
3.01k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
3.01k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
3.01k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
3.01k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
3.01k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
3.01k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
3.01k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
3.01k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
3.01k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
3.01k
    constexpr bool READ_32_BITS =
203
3.01k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
3.01k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
3.01k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
3.01k
    word >>= FIRST_BIT_OFFSET;
213
214
3.01k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
3.01k
    return word & mask;
221
3.01k
}
_ZN5doris11UnpackValueILi5ELi26ELb1EEEmPKh
Line
Count
Source
176
3.01k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
3.01k
    if (BIT_WIDTH == 0) return 0;
178
179
3.01k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
3.01k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
3.01k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
3.01k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
3.01k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
3.01k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
3.01k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
3.01k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
3.01k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
3.01k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
3.01k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
3.01k
    constexpr bool READ_32_BITS =
203
3.01k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
3.01k
    if (READ_32_BITS) {
206
3.01k
        uint32_t word = in[FIRST_WORD_IDX];
207
3.01k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
3.01k
        return word & mask;
209
3.01k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
3.01k
}
_ZN5doris11UnpackValueILi5ELi27ELb1EEEmPKh
Line
Count
Source
176
3.01k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
3.01k
    if (BIT_WIDTH == 0) return 0;
178
179
3.01k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
3.01k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
3.01k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
3.01k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
3.01k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
3.01k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
3.01k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
3.01k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
3.01k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
3.01k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
3.01k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
3.01k
    constexpr bool READ_32_BITS =
203
3.01k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
3.01k
    if (READ_32_BITS) {
206
3.01k
        uint32_t word = in[FIRST_WORD_IDX];
207
3.01k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
3.01k
        return word & mask;
209
3.01k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
3.01k
}
_ZN5doris11UnpackValueILi5ELi28ELb1EEEmPKh
Line
Count
Source
176
3.01k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
3.01k
    if (BIT_WIDTH == 0) return 0;
178
179
3.01k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
3.01k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
3.01k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
3.01k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
3.01k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
3.01k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
3.01k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
3.01k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
3.01k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
3.01k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
3.01k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
3.01k
    constexpr bool READ_32_BITS =
203
3.01k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
3.01k
    if (READ_32_BITS) {
206
3.01k
        uint32_t word = in[FIRST_WORD_IDX];
207
3.01k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
3.01k
        return word & mask;
209
3.01k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
3.01k
}
_ZN5doris11UnpackValueILi5ELi29ELb1EEEmPKh
Line
Count
Source
176
3.01k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
3.01k
    if (BIT_WIDTH == 0) return 0;
178
179
3.01k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
3.01k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
3.01k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
3.01k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
3.01k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
3.01k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
3.01k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
3.01k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
3.01k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
3.01k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
3.01k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
3.01k
    constexpr bool READ_32_BITS =
203
3.01k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
3.01k
    if (READ_32_BITS) {
206
3.01k
        uint32_t word = in[FIRST_WORD_IDX];
207
3.01k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
3.01k
        return word & mask;
209
3.01k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
3.01k
}
_ZN5doris11UnpackValueILi5ELi30ELb1EEEmPKh
Line
Count
Source
176
3.01k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
3.01k
    if (BIT_WIDTH == 0) return 0;
178
179
3.01k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
3.01k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
3.01k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
3.01k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
3.01k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
3.01k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
3.01k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
3.01k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
3.01k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
3.01k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
3.01k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
3.01k
    constexpr bool READ_32_BITS =
203
3.01k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
3.01k
    if (READ_32_BITS) {
206
3.01k
        uint32_t word = in[FIRST_WORD_IDX];
207
3.01k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
3.01k
        return word & mask;
209
3.01k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
3.01k
}
_ZN5doris11UnpackValueILi5ELi31ELb1EEEmPKh
Line
Count
Source
176
3.01k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
3.01k
    if (BIT_WIDTH == 0) return 0;
178
179
3.01k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
3.01k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
3.01k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
3.01k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
3.01k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
3.01k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
3.01k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
3.01k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
3.01k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
3.01k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
3.01k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
3.01k
    constexpr bool READ_32_BITS =
203
3.01k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
3.01k
    if (READ_32_BITS) {
206
3.01k
        uint32_t word = in[FIRST_WORD_IDX];
207
3.01k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
3.01k
        return word & mask;
209
3.01k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
3.01k
}
Unexecuted instantiation: _ZN5doris11UnpackValueILi5ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi5ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi5ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi5ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi5ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi5ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi5ELi24ELb0EEEmPKh
_ZN5doris11UnpackValueILi5ELi23ELb0EEEmPKh
Line
Count
Source
176
239
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
239
    if (BIT_WIDTH == 0) return 0;
178
179
239
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
239
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
239
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
239
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
239
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
239
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
239
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
239
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
239
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
239
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
239
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
239
    constexpr bool READ_32_BITS =
203
239
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
239
    if (READ_32_BITS) {
206
239
        uint32_t word = in[FIRST_WORD_IDX];
207
239
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
239
        return word & mask;
209
239
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
239
}
_ZN5doris11UnpackValueILi5ELi22ELb0EEEmPKh
Line
Count
Source
176
239
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
239
    if (BIT_WIDTH == 0) return 0;
178
179
239
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
239
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
239
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
239
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
239
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
239
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
239
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
239
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
239
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
239
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
239
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
239
    constexpr bool READ_32_BITS =
203
239
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
239
    if (READ_32_BITS) {
206
239
        uint32_t word = in[FIRST_WORD_IDX];
207
239
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
239
        return word & mask;
209
239
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
239
}
_ZN5doris11UnpackValueILi5ELi21ELb0EEEmPKh
Line
Count
Source
176
239
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
239
    if (BIT_WIDTH == 0) return 0;
178
179
239
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
239
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
239
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
239
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
239
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
239
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
239
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
239
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
239
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
239
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
239
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
239
    constexpr bool READ_32_BITS =
203
239
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
239
    if (READ_32_BITS) {
206
239
        uint32_t word = in[FIRST_WORD_IDX];
207
239
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
239
        return word & mask;
209
239
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
239
}
_ZN5doris11UnpackValueILi5ELi20ELb0EEEmPKh
Line
Count
Source
176
239
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
239
    if (BIT_WIDTH == 0) return 0;
178
179
239
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
239
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
239
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
239
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
239
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
239
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
239
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
239
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
239
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
239
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
239
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
239
    constexpr bool READ_32_BITS =
203
239
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
239
    if (READ_32_BITS) {
206
239
        uint32_t word = in[FIRST_WORD_IDX];
207
239
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
239
        return word & mask;
209
239
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
239
}
_ZN5doris11UnpackValueILi5ELi19ELb0EEEmPKh
Line
Count
Source
176
239
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
239
    if (BIT_WIDTH == 0) return 0;
178
179
239
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
239
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
239
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
239
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
239
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
239
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
239
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
239
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
239
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
239
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
239
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
239
    constexpr bool READ_32_BITS =
203
239
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
239
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
239
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
239
    word >>= FIRST_BIT_OFFSET;
213
214
239
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
239
    return word & mask;
221
239
}
_ZN5doris11UnpackValueILi5ELi18ELb0EEEmPKh
Line
Count
Source
176
239
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
239
    if (BIT_WIDTH == 0) return 0;
178
179
239
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
239
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
239
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
239
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
239
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
239
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
239
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
239
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
239
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
239
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
239
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
239
    constexpr bool READ_32_BITS =
203
239
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
239
    if (READ_32_BITS) {
206
239
        uint32_t word = in[FIRST_WORD_IDX];
207
239
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
239
        return word & mask;
209
239
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
239
}
_ZN5doris11UnpackValueILi5ELi17ELb0EEEmPKh
Line
Count
Source
176
239
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
239
    if (BIT_WIDTH == 0) return 0;
178
179
239
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
239
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
239
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
239
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
239
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
239
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
239
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
239
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
239
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
239
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
239
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
239
    constexpr bool READ_32_BITS =
203
239
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
239
    if (READ_32_BITS) {
206
239
        uint32_t word = in[FIRST_WORD_IDX];
207
239
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
239
        return word & mask;
209
239
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
239
}
_ZN5doris11UnpackValueILi5ELi16ELb0EEEmPKh
Line
Count
Source
176
239
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
239
    if (BIT_WIDTH == 0) return 0;
178
179
239
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
239
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
239
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
239
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
239
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
239
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
239
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
239
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
239
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
239
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
239
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
239
    constexpr bool READ_32_BITS =
203
239
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
239
    if (READ_32_BITS) {
206
239
        uint32_t word = in[FIRST_WORD_IDX];
207
239
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
239
        return word & mask;
209
239
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
239
}
_ZN5doris11UnpackValueILi5ELi15ELb0EEEmPKh
Line
Count
Source
176
251
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
251
    if (BIT_WIDTH == 0) return 0;
178
179
251
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
251
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
251
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
251
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
251
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
251
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
251
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
251
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
251
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
251
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
251
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
251
    constexpr bool READ_32_BITS =
203
251
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
251
    if (READ_32_BITS) {
206
251
        uint32_t word = in[FIRST_WORD_IDX];
207
251
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
251
        return word & mask;
209
251
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
251
}
_ZN5doris11UnpackValueILi5ELi14ELb0EEEmPKh
Line
Count
Source
176
251
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
251
    if (BIT_WIDTH == 0) return 0;
178
179
251
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
251
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
251
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
251
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
251
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
251
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
251
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
251
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
251
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
251
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
251
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
251
    constexpr bool READ_32_BITS =
203
251
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
251
    if (READ_32_BITS) {
206
251
        uint32_t word = in[FIRST_WORD_IDX];
207
251
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
251
        return word & mask;
209
251
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
251
}
_ZN5doris11UnpackValueILi5ELi13ELb0EEEmPKh
Line
Count
Source
176
251
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
251
    if (BIT_WIDTH == 0) return 0;
178
179
251
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
251
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
251
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
251
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
251
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
251
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
251
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
251
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
251
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
251
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
251
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
251
    constexpr bool READ_32_BITS =
203
251
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
251
    if (READ_32_BITS) {
206
251
        uint32_t word = in[FIRST_WORD_IDX];
207
251
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
251
        return word & mask;
209
251
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
251
}
_ZN5doris11UnpackValueILi5ELi12ELb0EEEmPKh
Line
Count
Source
176
251
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
251
    if (BIT_WIDTH == 0) return 0;
178
179
251
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
251
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
251
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
251
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
251
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
251
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
251
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
251
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
251
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
251
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
251
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
251
    constexpr bool READ_32_BITS =
203
251
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
251
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
251
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
251
    word >>= FIRST_BIT_OFFSET;
213
214
251
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
251
    return word & mask;
221
251
}
_ZN5doris11UnpackValueILi5ELi11ELb0EEEmPKh
Line
Count
Source
176
251
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
251
    if (BIT_WIDTH == 0) return 0;
178
179
251
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
251
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
251
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
251
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
251
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
251
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
251
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
251
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
251
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
251
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
251
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
251
    constexpr bool READ_32_BITS =
203
251
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
251
    if (READ_32_BITS) {
206
251
        uint32_t word = in[FIRST_WORD_IDX];
207
251
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
251
        return word & mask;
209
251
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
251
}
_ZN5doris11UnpackValueILi5ELi10ELb0EEEmPKh
Line
Count
Source
176
251
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
251
    if (BIT_WIDTH == 0) return 0;
178
179
251
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
251
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
251
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
251
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
251
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
251
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
251
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
251
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
251
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
251
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
251
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
251
    constexpr bool READ_32_BITS =
203
251
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
251
    if (READ_32_BITS) {
206
251
        uint32_t word = in[FIRST_WORD_IDX];
207
251
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
251
        return word & mask;
209
251
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
251
}
_ZN5doris11UnpackValueILi5ELi9ELb0EEEmPKh
Line
Count
Source
176
251
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
251
    if (BIT_WIDTH == 0) return 0;
178
179
251
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
251
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
251
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
251
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
251
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
251
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
251
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
251
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
251
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
251
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
251
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
251
    constexpr bool READ_32_BITS =
203
251
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
251
    if (READ_32_BITS) {
206
251
        uint32_t word = in[FIRST_WORD_IDX];
207
251
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
251
        return word & mask;
209
251
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
251
}
_ZN5doris11UnpackValueILi5ELi8ELb0EEEmPKh
Line
Count
Source
176
251
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
251
    if (BIT_WIDTH == 0) return 0;
178
179
251
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
251
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
251
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
251
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
251
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
251
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
251
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
251
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
251
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
251
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
251
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
251
    constexpr bool READ_32_BITS =
203
251
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
251
    if (READ_32_BITS) {
206
251
        uint32_t word = in[FIRST_WORD_IDX];
207
251
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
251
        return word & mask;
209
251
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
251
}
_ZN5doris11UnpackValueILi5ELi7ELb0EEEmPKh
Line
Count
Source
176
276
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
276
    if (BIT_WIDTH == 0) return 0;
178
179
276
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
276
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
276
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
276
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
276
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
276
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
276
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
276
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
276
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
276
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
276
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
276
    constexpr bool READ_32_BITS =
203
276
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
276
    if (READ_32_BITS) {
206
276
        uint32_t word = in[FIRST_WORD_IDX];
207
276
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
276
        return word & mask;
209
276
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
276
}
_ZN5doris11UnpackValueILi5ELi6ELb0EEEmPKh
Line
Count
Source
176
276
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
276
    if (BIT_WIDTH == 0) return 0;
178
179
276
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
276
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
276
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
276
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
276
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
276
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
276
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
276
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
276
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
276
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
276
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
276
    constexpr bool READ_32_BITS =
203
276
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
276
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
276
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
276
    word >>= FIRST_BIT_OFFSET;
213
214
276
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
276
    return word & mask;
221
276
}
_ZN5doris11UnpackValueILi5ELi5ELb0EEEmPKh
Line
Count
Source
176
276
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
276
    if (BIT_WIDTH == 0) return 0;
178
179
276
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
276
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
276
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
276
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
276
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
276
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
276
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
276
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
276
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
276
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
276
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
276
    constexpr bool READ_32_BITS =
203
276
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
276
    if (READ_32_BITS) {
206
276
        uint32_t word = in[FIRST_WORD_IDX];
207
276
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
276
        return word & mask;
209
276
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
276
}
_ZN5doris11UnpackValueILi5ELi4ELb0EEEmPKh
Line
Count
Source
176
276
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
276
    if (BIT_WIDTH == 0) return 0;
178
179
276
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
276
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
276
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
276
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
276
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
276
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
276
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
276
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
276
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
276
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
276
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
276
    constexpr bool READ_32_BITS =
203
276
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
276
    if (READ_32_BITS) {
206
276
        uint32_t word = in[FIRST_WORD_IDX];
207
276
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
276
        return word & mask;
209
276
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
276
}
_ZN5doris11UnpackValueILi5ELi3ELb0EEEmPKh
Line
Count
Source
176
276
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
276
    if (BIT_WIDTH == 0) return 0;
178
179
276
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
276
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
276
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
276
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
276
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
276
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
276
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
276
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
276
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
276
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
276
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
276
    constexpr bool READ_32_BITS =
203
276
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
276
    if (READ_32_BITS) {
206
276
        uint32_t word = in[FIRST_WORD_IDX];
207
276
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
276
        return word & mask;
209
276
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
276
}
_ZN5doris11UnpackValueILi5ELi2ELb0EEEmPKh
Line
Count
Source
176
276
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
276
    if (BIT_WIDTH == 0) return 0;
178
179
276
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
276
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
276
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
276
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
276
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
276
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
276
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
276
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
276
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
276
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
276
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
276
    constexpr bool READ_32_BITS =
203
276
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
276
    if (READ_32_BITS) {
206
276
        uint32_t word = in[FIRST_WORD_IDX];
207
276
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
276
        return word & mask;
209
276
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
276
}
_ZN5doris11UnpackValueILi5ELi1ELb0EEEmPKh
Line
Count
Source
176
276
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
276
    if (BIT_WIDTH == 0) return 0;
178
179
276
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
276
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
276
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
276
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
276
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
276
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
276
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
276
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
276
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
276
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
276
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
276
    constexpr bool READ_32_BITS =
203
276
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
276
    if (READ_32_BITS) {
206
276
        uint32_t word = in[FIRST_WORD_IDX];
207
276
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
276
        return word & mask;
209
276
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
276
}
_ZN5doris11UnpackValueILi5ELi0ELb0EEEmPKh
Line
Count
Source
176
276
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
276
    if (BIT_WIDTH == 0) return 0;
178
179
276
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
276
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
276
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
276
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
276
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
276
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
276
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
276
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
276
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
276
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
276
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
276
    constexpr bool READ_32_BITS =
203
276
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
276
    if (READ_32_BITS) {
206
276
        uint32_t word = in[FIRST_WORD_IDX];
207
276
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
276
        return word & mask;
209
276
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
276
}
_ZN5doris11UnpackValueILi6ELi0ELb1EEEmPKh
Line
Count
Source
176
586
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
586
    if (BIT_WIDTH == 0) return 0;
178
179
586
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
586
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
586
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
586
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
586
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
586
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
586
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
586
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
586
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
586
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
586
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
586
    constexpr bool READ_32_BITS =
203
586
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
586
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
586
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
586
    word >>= FIRST_BIT_OFFSET;
213
214
586
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
586
    return word & mask;
221
586
}
_ZN5doris11UnpackValueILi6ELi1ELb1EEEmPKh
Line
Count
Source
176
586
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
586
    if (BIT_WIDTH == 0) return 0;
178
179
586
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
586
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
586
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
586
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
586
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
586
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
586
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
586
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
586
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
586
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
586
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
586
    constexpr bool READ_32_BITS =
203
586
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
586
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
586
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
586
    word >>= FIRST_BIT_OFFSET;
213
214
586
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
586
    return word & mask;
221
586
}
_ZN5doris11UnpackValueILi6ELi2ELb1EEEmPKh
Line
Count
Source
176
586
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
586
    if (BIT_WIDTH == 0) return 0;
178
179
586
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
586
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
586
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
586
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
586
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
586
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
586
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
586
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
586
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
586
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
586
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
586
    constexpr bool READ_32_BITS =
203
586
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
586
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
586
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
586
    word >>= FIRST_BIT_OFFSET;
213
214
586
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
586
    return word & mask;
221
586
}
_ZN5doris11UnpackValueILi6ELi3ELb1EEEmPKh
Line
Count
Source
176
586
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
586
    if (BIT_WIDTH == 0) return 0;
178
179
586
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
586
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
586
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
586
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
586
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
586
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
586
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
586
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
586
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
586
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
586
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
586
    constexpr bool READ_32_BITS =
203
586
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
586
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
586
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
586
    word >>= FIRST_BIT_OFFSET;
213
214
586
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
586
    return word & mask;
221
586
}
_ZN5doris11UnpackValueILi6ELi4ELb1EEEmPKh
Line
Count
Source
176
586
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
586
    if (BIT_WIDTH == 0) return 0;
178
179
586
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
586
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
586
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
586
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
586
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
586
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
586
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
586
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
586
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
586
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
586
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
586
    constexpr bool READ_32_BITS =
203
586
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
586
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
586
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
586
    word >>= FIRST_BIT_OFFSET;
213
214
586
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
586
    return word & mask;
221
586
}
_ZN5doris11UnpackValueILi6ELi5ELb1EEEmPKh
Line
Count
Source
176
586
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
586
    if (BIT_WIDTH == 0) return 0;
178
179
586
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
586
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
586
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
586
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
586
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
586
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
586
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
586
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
586
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
586
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
586
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
586
    constexpr bool READ_32_BITS =
203
586
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
586
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
586
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
586
    word >>= FIRST_BIT_OFFSET;
213
214
586
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
586
    return word & mask;
221
586
}
_ZN5doris11UnpackValueILi6ELi6ELb1EEEmPKh
Line
Count
Source
176
586
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
586
    if (BIT_WIDTH == 0) return 0;
178
179
586
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
586
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
586
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
586
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
586
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
586
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
586
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
586
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
586
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
586
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
586
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
586
    constexpr bool READ_32_BITS =
203
586
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
586
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
586
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
586
    word >>= FIRST_BIT_OFFSET;
213
214
586
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
586
    return word & mask;
221
586
}
_ZN5doris11UnpackValueILi6ELi7ELb1EEEmPKh
Line
Count
Source
176
586
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
586
    if (BIT_WIDTH == 0) return 0;
178
179
586
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
586
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
586
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
586
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
586
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
586
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
586
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
586
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
586
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
586
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
586
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
586
    constexpr bool READ_32_BITS =
203
586
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
586
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
586
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
586
    word >>= FIRST_BIT_OFFSET;
213
214
586
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
586
    return word & mask;
221
586
}
_ZN5doris11UnpackValueILi6ELi8ELb1EEEmPKh
Line
Count
Source
176
586
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
586
    if (BIT_WIDTH == 0) return 0;
178
179
586
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
586
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
586
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
586
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
586
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
586
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
586
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
586
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
586
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
586
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
586
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
586
    constexpr bool READ_32_BITS =
203
586
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
586
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
586
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
586
    word >>= FIRST_BIT_OFFSET;
213
214
586
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
586
    return word & mask;
221
586
}
_ZN5doris11UnpackValueILi6ELi9ELb1EEEmPKh
Line
Count
Source
176
586
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
586
    if (BIT_WIDTH == 0) return 0;
178
179
586
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
586
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
586
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
586
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
586
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
586
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
586
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
586
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
586
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
586
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
586
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
586
    constexpr bool READ_32_BITS =
203
586
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
586
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
586
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
586
    word >>= FIRST_BIT_OFFSET;
213
214
586
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
586
    return word & mask;
221
586
}
_ZN5doris11UnpackValueILi6ELi10ELb1EEEmPKh
Line
Count
Source
176
586
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
586
    if (BIT_WIDTH == 0) return 0;
178
179
586
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
586
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
586
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
586
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
586
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
586
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
586
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
586
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
586
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
586
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
586
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
586
    constexpr bool READ_32_BITS =
203
586
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
586
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
586
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
586
    word >>= FIRST_BIT_OFFSET;
213
214
586
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
586
    return word & mask;
221
586
}
_ZN5doris11UnpackValueILi6ELi11ELb1EEEmPKh
Line
Count
Source
176
586
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
586
    if (BIT_WIDTH == 0) return 0;
178
179
586
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
586
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
586
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
586
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
586
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
586
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
586
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
586
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
586
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
586
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
586
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
586
    constexpr bool READ_32_BITS =
203
586
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
586
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
586
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
586
    word >>= FIRST_BIT_OFFSET;
213
214
586
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
586
    return word & mask;
221
586
}
_ZN5doris11UnpackValueILi6ELi12ELb1EEEmPKh
Line
Count
Source
176
586
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
586
    if (BIT_WIDTH == 0) return 0;
178
179
586
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
586
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
586
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
586
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
586
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
586
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
586
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
586
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
586
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
586
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
586
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
586
    constexpr bool READ_32_BITS =
203
586
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
586
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
586
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
586
    word >>= FIRST_BIT_OFFSET;
213
214
586
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
586
    return word & mask;
221
586
}
_ZN5doris11UnpackValueILi6ELi13ELb1EEEmPKh
Line
Count
Source
176
586
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
586
    if (BIT_WIDTH == 0) return 0;
178
179
586
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
586
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
586
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
586
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
586
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
586
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
586
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
586
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
586
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
586
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
586
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
586
    constexpr bool READ_32_BITS =
203
586
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
586
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
586
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
586
    word >>= FIRST_BIT_OFFSET;
213
214
586
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
586
    return word & mask;
221
586
}
_ZN5doris11UnpackValueILi6ELi14ELb1EEEmPKh
Line
Count
Source
176
586
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
586
    if (BIT_WIDTH == 0) return 0;
178
179
586
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
586
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
586
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
586
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
586
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
586
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
586
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
586
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
586
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
586
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
586
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
586
    constexpr bool READ_32_BITS =
203
586
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
586
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
586
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
586
    word >>= FIRST_BIT_OFFSET;
213
214
586
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
586
    return word & mask;
221
586
}
_ZN5doris11UnpackValueILi6ELi15ELb1EEEmPKh
Line
Count
Source
176
586
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
586
    if (BIT_WIDTH == 0) return 0;
178
179
586
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
586
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
586
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
586
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
586
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
586
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
586
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
586
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
586
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
586
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
586
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
586
    constexpr bool READ_32_BITS =
203
586
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
586
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
586
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
586
    word >>= FIRST_BIT_OFFSET;
213
214
586
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
586
    return word & mask;
221
586
}
_ZN5doris11UnpackValueILi6ELi16ELb1EEEmPKh
Line
Count
Source
176
586
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
586
    if (BIT_WIDTH == 0) return 0;
178
179
586
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
586
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
586
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
586
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
586
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
586
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
586
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
586
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
586
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
586
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
586
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
586
    constexpr bool READ_32_BITS =
203
586
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
586
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
586
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
586
    word >>= FIRST_BIT_OFFSET;
213
214
586
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
586
    return word & mask;
221
586
}
_ZN5doris11UnpackValueILi6ELi17ELb1EEEmPKh
Line
Count
Source
176
586
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
586
    if (BIT_WIDTH == 0) return 0;
178
179
586
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
586
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
586
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
586
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
586
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
586
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
586
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
586
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
586
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
586
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
586
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
586
    constexpr bool READ_32_BITS =
203
586
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
586
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
586
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
586
    word >>= FIRST_BIT_OFFSET;
213
214
586
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
586
    return word & mask;
221
586
}
_ZN5doris11UnpackValueILi6ELi18ELb1EEEmPKh
Line
Count
Source
176
586
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
586
    if (BIT_WIDTH == 0) return 0;
178
179
586
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
586
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
586
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
586
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
586
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
586
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
586
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
586
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
586
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
586
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
586
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
586
    constexpr bool READ_32_BITS =
203
586
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
586
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
586
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
586
    word >>= FIRST_BIT_OFFSET;
213
214
586
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
586
    return word & mask;
221
586
}
_ZN5doris11UnpackValueILi6ELi19ELb1EEEmPKh
Line
Count
Source
176
586
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
586
    if (BIT_WIDTH == 0) return 0;
178
179
586
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
586
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
586
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
586
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
586
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
586
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
586
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
586
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
586
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
586
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
586
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
586
    constexpr bool READ_32_BITS =
203
586
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
586
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
586
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
586
    word >>= FIRST_BIT_OFFSET;
213
214
586
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
586
    return word & mask;
221
586
}
_ZN5doris11UnpackValueILi6ELi20ELb1EEEmPKh
Line
Count
Source
176
586
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
586
    if (BIT_WIDTH == 0) return 0;
178
179
586
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
586
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
586
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
586
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
586
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
586
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
586
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
586
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
586
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
586
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
586
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
586
    constexpr bool READ_32_BITS =
203
586
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
586
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
586
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
586
    word >>= FIRST_BIT_OFFSET;
213
214
586
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
586
    return word & mask;
221
586
}
_ZN5doris11UnpackValueILi6ELi21ELb1EEEmPKh
Line
Count
Source
176
586
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
586
    if (BIT_WIDTH == 0) return 0;
178
179
586
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
586
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
586
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
586
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
586
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
586
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
586
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
586
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
586
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
586
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
586
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
586
    constexpr bool READ_32_BITS =
203
586
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
586
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
586
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
586
    word >>= FIRST_BIT_OFFSET;
213
214
586
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
586
    return word & mask;
221
586
}
_ZN5doris11UnpackValueILi6ELi22ELb1EEEmPKh
Line
Count
Source
176
586
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
586
    if (BIT_WIDTH == 0) return 0;
178
179
586
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
586
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
586
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
586
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
586
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
586
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
586
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
586
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
586
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
586
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
586
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
586
    constexpr bool READ_32_BITS =
203
586
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
586
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
586
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
586
    word >>= FIRST_BIT_OFFSET;
213
214
586
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
586
    return word & mask;
221
586
}
_ZN5doris11UnpackValueILi6ELi23ELb1EEEmPKh
Line
Count
Source
176
586
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
586
    if (BIT_WIDTH == 0) return 0;
178
179
586
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
586
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
586
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
586
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
586
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
586
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
586
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
586
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
586
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
586
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
586
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
586
    constexpr bool READ_32_BITS =
203
586
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
586
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
586
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
586
    word >>= FIRST_BIT_OFFSET;
213
214
586
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
586
    return word & mask;
221
586
}
_ZN5doris11UnpackValueILi6ELi24ELb1EEEmPKh
Line
Count
Source
176
586
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
586
    if (BIT_WIDTH == 0) return 0;
178
179
586
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
586
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
586
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
586
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
586
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
586
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
586
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
586
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
586
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
586
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
586
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
586
    constexpr bool READ_32_BITS =
203
586
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
586
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
586
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
586
    word >>= FIRST_BIT_OFFSET;
213
214
586
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
586
    return word & mask;
221
586
}
_ZN5doris11UnpackValueILi6ELi25ELb1EEEmPKh
Line
Count
Source
176
586
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
586
    if (BIT_WIDTH == 0) return 0;
178
179
586
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
586
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
586
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
586
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
586
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
586
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
586
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
586
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
586
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
586
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
586
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
586
    constexpr bool READ_32_BITS =
203
586
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
586
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
586
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
586
    word >>= FIRST_BIT_OFFSET;
213
214
586
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
586
    return word & mask;
221
586
}
_ZN5doris11UnpackValueILi6ELi26ELb1EEEmPKh
Line
Count
Source
176
586
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
586
    if (BIT_WIDTH == 0) return 0;
178
179
586
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
586
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
586
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
586
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
586
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
586
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
586
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
586
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
586
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
586
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
586
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
586
    constexpr bool READ_32_BITS =
203
586
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
586
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
586
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
586
    word >>= FIRST_BIT_OFFSET;
213
214
586
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
586
    return word & mask;
221
586
}
_ZN5doris11UnpackValueILi6ELi27ELb1EEEmPKh
Line
Count
Source
176
586
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
586
    if (BIT_WIDTH == 0) return 0;
178
179
586
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
586
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
586
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
586
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
586
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
586
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
586
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
586
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
586
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
586
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
586
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
586
    constexpr bool READ_32_BITS =
203
586
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
586
    if (READ_32_BITS) {
206
586
        uint32_t word = in[FIRST_WORD_IDX];
207
586
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
586
        return word & mask;
209
586
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
586
}
_ZN5doris11UnpackValueILi6ELi28ELb1EEEmPKh
Line
Count
Source
176
586
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
586
    if (BIT_WIDTH == 0) return 0;
178
179
586
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
586
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
586
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
586
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
586
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
586
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
586
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
586
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
586
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
586
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
586
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
586
    constexpr bool READ_32_BITS =
203
586
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
586
    if (READ_32_BITS) {
206
586
        uint32_t word = in[FIRST_WORD_IDX];
207
586
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
586
        return word & mask;
209
586
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
586
}
_ZN5doris11UnpackValueILi6ELi29ELb1EEEmPKh
Line
Count
Source
176
586
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
586
    if (BIT_WIDTH == 0) return 0;
178
179
586
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
586
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
586
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
586
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
586
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
586
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
586
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
586
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
586
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
586
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
586
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
586
    constexpr bool READ_32_BITS =
203
586
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
586
    if (READ_32_BITS) {
206
586
        uint32_t word = in[FIRST_WORD_IDX];
207
586
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
586
        return word & mask;
209
586
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
586
}
_ZN5doris11UnpackValueILi6ELi30ELb1EEEmPKh
Line
Count
Source
176
586
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
586
    if (BIT_WIDTH == 0) return 0;
178
179
586
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
586
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
586
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
586
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
586
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
586
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
586
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
586
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
586
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
586
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
586
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
586
    constexpr bool READ_32_BITS =
203
586
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
586
    if (READ_32_BITS) {
206
586
        uint32_t word = in[FIRST_WORD_IDX];
207
586
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
586
        return word & mask;
209
586
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
586
}
_ZN5doris11UnpackValueILi6ELi31ELb1EEEmPKh
Line
Count
Source
176
586
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
586
    if (BIT_WIDTH == 0) return 0;
178
179
586
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
586
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
586
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
586
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
586
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
586
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
586
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
586
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
586
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
586
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
586
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
586
    constexpr bool READ_32_BITS =
203
586
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
586
    if (READ_32_BITS) {
206
586
        uint32_t word = in[FIRST_WORD_IDX];
207
586
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
586
        return word & mask;
209
586
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
586
}
Unexecuted instantiation: _ZN5doris11UnpackValueILi6ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi6ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi6ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi6ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi6ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi6ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi6ELi24ELb0EEEmPKh
_ZN5doris11UnpackValueILi6ELi23ELb0EEEmPKh
Line
Count
Source
176
40
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
40
    if (BIT_WIDTH == 0) return 0;
178
179
40
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
40
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
40
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
40
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
40
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
40
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
40
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
40
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
40
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
40
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
40
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
40
    constexpr bool READ_32_BITS =
203
40
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
40
    if (READ_32_BITS) {
206
40
        uint32_t word = in[FIRST_WORD_IDX];
207
40
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
40
        return word & mask;
209
40
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
40
}
_ZN5doris11UnpackValueILi6ELi22ELb0EEEmPKh
Line
Count
Source
176
40
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
40
    if (BIT_WIDTH == 0) return 0;
178
179
40
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
40
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
40
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
40
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
40
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
40
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
40
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
40
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
40
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
40
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
40
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
40
    constexpr bool READ_32_BITS =
203
40
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
40
    if (READ_32_BITS) {
206
40
        uint32_t word = in[FIRST_WORD_IDX];
207
40
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
40
        return word & mask;
209
40
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
40
}
_ZN5doris11UnpackValueILi6ELi21ELb0EEEmPKh
Line
Count
Source
176
40
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
40
    if (BIT_WIDTH == 0) return 0;
178
179
40
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
40
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
40
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
40
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
40
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
40
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
40
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
40
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
40
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
40
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
40
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
40
    constexpr bool READ_32_BITS =
203
40
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
40
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
40
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
40
    word >>= FIRST_BIT_OFFSET;
213
214
40
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
40
    return word & mask;
221
40
}
_ZN5doris11UnpackValueILi6ELi20ELb0EEEmPKh
Line
Count
Source
176
40
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
40
    if (BIT_WIDTH == 0) return 0;
178
179
40
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
40
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
40
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
40
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
40
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
40
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
40
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
40
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
40
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
40
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
40
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
40
    constexpr bool READ_32_BITS =
203
40
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
40
    if (READ_32_BITS) {
206
40
        uint32_t word = in[FIRST_WORD_IDX];
207
40
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
40
        return word & mask;
209
40
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
40
}
_ZN5doris11UnpackValueILi6ELi19ELb0EEEmPKh
Line
Count
Source
176
40
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
40
    if (BIT_WIDTH == 0) return 0;
178
179
40
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
40
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
40
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
40
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
40
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
40
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
40
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
40
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
40
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
40
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
40
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
40
    constexpr bool READ_32_BITS =
203
40
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
40
    if (READ_32_BITS) {
206
40
        uint32_t word = in[FIRST_WORD_IDX];
207
40
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
40
        return word & mask;
209
40
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
40
}
_ZN5doris11UnpackValueILi6ELi18ELb0EEEmPKh
Line
Count
Source
176
40
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
40
    if (BIT_WIDTH == 0) return 0;
178
179
40
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
40
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
40
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
40
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
40
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
40
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
40
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
40
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
40
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
40
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
40
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
40
    constexpr bool READ_32_BITS =
203
40
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
40
    if (READ_32_BITS) {
206
40
        uint32_t word = in[FIRST_WORD_IDX];
207
40
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
40
        return word & mask;
209
40
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
40
}
_ZN5doris11UnpackValueILi6ELi17ELb0EEEmPKh
Line
Count
Source
176
40
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
40
    if (BIT_WIDTH == 0) return 0;
178
179
40
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
40
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
40
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
40
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
40
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
40
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
40
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
40
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
40
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
40
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
40
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
40
    constexpr bool READ_32_BITS =
203
40
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
40
    if (READ_32_BITS) {
206
40
        uint32_t word = in[FIRST_WORD_IDX];
207
40
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
40
        return word & mask;
209
40
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
40
}
_ZN5doris11UnpackValueILi6ELi16ELb0EEEmPKh
Line
Count
Source
176
40
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
40
    if (BIT_WIDTH == 0) return 0;
178
179
40
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
40
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
40
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
40
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
40
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
40
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
40
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
40
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
40
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
40
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
40
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
40
    constexpr bool READ_32_BITS =
203
40
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
40
    if (READ_32_BITS) {
206
40
        uint32_t word = in[FIRST_WORD_IDX];
207
40
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
40
        return word & mask;
209
40
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
40
}
_ZN5doris11UnpackValueILi6ELi15ELb0EEEmPKh
Line
Count
Source
176
43
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
43
    if (BIT_WIDTH == 0) return 0;
178
179
43
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
43
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
43
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
43
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
43
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
43
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
43
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
43
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
43
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
43
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
43
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
43
    constexpr bool READ_32_BITS =
203
43
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
43
    if (READ_32_BITS) {
206
43
        uint32_t word = in[FIRST_WORD_IDX];
207
43
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
43
        return word & mask;
209
43
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
43
}
_ZN5doris11UnpackValueILi6ELi14ELb0EEEmPKh
Line
Count
Source
176
43
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
43
    if (BIT_WIDTH == 0) return 0;
178
179
43
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
43
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
43
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
43
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
43
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
43
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
43
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
43
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
43
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
43
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
43
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
43
    constexpr bool READ_32_BITS =
203
43
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
43
    if (READ_32_BITS) {
206
43
        uint32_t word = in[FIRST_WORD_IDX];
207
43
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
43
        return word & mask;
209
43
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
43
}
_ZN5doris11UnpackValueILi6ELi13ELb0EEEmPKh
Line
Count
Source
176
43
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
43
    if (BIT_WIDTH == 0) return 0;
178
179
43
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
43
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
43
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
43
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
43
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
43
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
43
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
43
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
43
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
43
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
43
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
43
    constexpr bool READ_32_BITS =
203
43
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
43
    if (READ_32_BITS) {
206
43
        uint32_t word = in[FIRST_WORD_IDX];
207
43
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
43
        return word & mask;
209
43
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
43
}
_ZN5doris11UnpackValueILi6ELi12ELb0EEEmPKh
Line
Count
Source
176
43
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
43
    if (BIT_WIDTH == 0) return 0;
178
179
43
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
43
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
43
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
43
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
43
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
43
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
43
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
43
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
43
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
43
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
43
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
43
    constexpr bool READ_32_BITS =
203
43
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
43
    if (READ_32_BITS) {
206
43
        uint32_t word = in[FIRST_WORD_IDX];
207
43
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
43
        return word & mask;
209
43
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
43
}
_ZN5doris11UnpackValueILi6ELi11ELb0EEEmPKh
Line
Count
Source
176
43
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
43
    if (BIT_WIDTH == 0) return 0;
178
179
43
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
43
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
43
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
43
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
43
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
43
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
43
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
43
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
43
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
43
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
43
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
43
    constexpr bool READ_32_BITS =
203
43
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
43
    if (READ_32_BITS) {
206
43
        uint32_t word = in[FIRST_WORD_IDX];
207
43
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
43
        return word & mask;
209
43
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
43
}
_ZN5doris11UnpackValueILi6ELi10ELb0EEEmPKh
Line
Count
Source
176
43
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
43
    if (BIT_WIDTH == 0) return 0;
178
179
43
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
43
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
43
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
43
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
43
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
43
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
43
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
43
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
43
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
43
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
43
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
43
    constexpr bool READ_32_BITS =
203
43
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
43
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
43
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
43
    word >>= FIRST_BIT_OFFSET;
213
214
43
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
43
    return word & mask;
221
43
}
_ZN5doris11UnpackValueILi6ELi9ELb0EEEmPKh
Line
Count
Source
176
43
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
43
    if (BIT_WIDTH == 0) return 0;
178
179
43
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
43
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
43
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
43
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
43
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
43
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
43
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
43
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
43
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
43
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
43
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
43
    constexpr bool READ_32_BITS =
203
43
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
43
    if (READ_32_BITS) {
206
43
        uint32_t word = in[FIRST_WORD_IDX];
207
43
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
43
        return word & mask;
209
43
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
43
}
_ZN5doris11UnpackValueILi6ELi8ELb0EEEmPKh
Line
Count
Source
176
43
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
43
    if (BIT_WIDTH == 0) return 0;
178
179
43
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
43
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
43
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
43
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
43
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
43
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
43
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
43
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
43
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
43
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
43
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
43
    constexpr bool READ_32_BITS =
203
43
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
43
    if (READ_32_BITS) {
206
43
        uint32_t word = in[FIRST_WORD_IDX];
207
43
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
43
        return word & mask;
209
43
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
43
}
_ZN5doris11UnpackValueILi6ELi7ELb0EEEmPKh
Line
Count
Source
176
76
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
76
    if (BIT_WIDTH == 0) return 0;
178
179
76
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
76
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
76
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
76
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
76
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
76
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
76
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
76
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
76
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
76
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
76
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
76
    constexpr bool READ_32_BITS =
203
76
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
76
    if (READ_32_BITS) {
206
76
        uint32_t word = in[FIRST_WORD_IDX];
207
76
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
76
        return word & mask;
209
76
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
76
}
_ZN5doris11UnpackValueILi6ELi6ELb0EEEmPKh
Line
Count
Source
176
76
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
76
    if (BIT_WIDTH == 0) return 0;
178
179
76
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
76
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
76
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
76
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
76
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
76
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
76
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
76
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
76
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
76
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
76
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
76
    constexpr bool READ_32_BITS =
203
76
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
76
    if (READ_32_BITS) {
206
76
        uint32_t word = in[FIRST_WORD_IDX];
207
76
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
76
        return word & mask;
209
76
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
76
}
_ZN5doris11UnpackValueILi6ELi5ELb0EEEmPKh
Line
Count
Source
176
76
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
76
    if (BIT_WIDTH == 0) return 0;
178
179
76
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
76
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
76
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
76
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
76
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
76
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
76
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
76
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
76
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
76
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
76
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
76
    constexpr bool READ_32_BITS =
203
76
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
76
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
76
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
76
    word >>= FIRST_BIT_OFFSET;
213
214
76
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
76
    return word & mask;
221
76
}
_ZN5doris11UnpackValueILi6ELi4ELb0EEEmPKh
Line
Count
Source
176
76
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
76
    if (BIT_WIDTH == 0) return 0;
178
179
76
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
76
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
76
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
76
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
76
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
76
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
76
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
76
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
76
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
76
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
76
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
76
    constexpr bool READ_32_BITS =
203
76
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
76
    if (READ_32_BITS) {
206
76
        uint32_t word = in[FIRST_WORD_IDX];
207
76
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
76
        return word & mask;
209
76
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
76
}
_ZN5doris11UnpackValueILi6ELi3ELb0EEEmPKh
Line
Count
Source
176
76
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
76
    if (BIT_WIDTH == 0) return 0;
178
179
76
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
76
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
76
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
76
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
76
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
76
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
76
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
76
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
76
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
76
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
76
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
76
    constexpr bool READ_32_BITS =
203
76
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
76
    if (READ_32_BITS) {
206
76
        uint32_t word = in[FIRST_WORD_IDX];
207
76
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
76
        return word & mask;
209
76
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
76
}
_ZN5doris11UnpackValueILi6ELi2ELb0EEEmPKh
Line
Count
Source
176
76
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
76
    if (BIT_WIDTH == 0) return 0;
178
179
76
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
76
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
76
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
76
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
76
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
76
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
76
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
76
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
76
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
76
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
76
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
76
    constexpr bool READ_32_BITS =
203
76
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
76
    if (READ_32_BITS) {
206
76
        uint32_t word = in[FIRST_WORD_IDX];
207
76
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
76
        return word & mask;
209
76
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
76
}
_ZN5doris11UnpackValueILi6ELi1ELb0EEEmPKh
Line
Count
Source
176
76
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
76
    if (BIT_WIDTH == 0) return 0;
178
179
76
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
76
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
76
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
76
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
76
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
76
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
76
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
76
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
76
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
76
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
76
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
76
    constexpr bool READ_32_BITS =
203
76
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
76
    if (READ_32_BITS) {
206
76
        uint32_t word = in[FIRST_WORD_IDX];
207
76
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
76
        return word & mask;
209
76
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
76
}
_ZN5doris11UnpackValueILi6ELi0ELb0EEEmPKh
Line
Count
Source
176
76
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
76
    if (BIT_WIDTH == 0) return 0;
178
179
76
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
76
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
76
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
76
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
76
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
76
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
76
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
76
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
76
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
76
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
76
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
76
    constexpr bool READ_32_BITS =
203
76
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
76
    if (READ_32_BITS) {
206
76
        uint32_t word = in[FIRST_WORD_IDX];
207
76
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
76
        return word & mask;
209
76
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
76
}
_ZN5doris11UnpackValueILi7ELi0ELb1EEEmPKh
Line
Count
Source
176
123
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
123
    if (BIT_WIDTH == 0) return 0;
178
179
123
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
123
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
123
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
123
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
123
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
123
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
123
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
123
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
123
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
123
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
123
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
123
    constexpr bool READ_32_BITS =
203
123
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
123
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
123
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
123
    word >>= FIRST_BIT_OFFSET;
213
214
123
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
123
    return word & mask;
221
123
}
_ZN5doris11UnpackValueILi7ELi1ELb1EEEmPKh
Line
Count
Source
176
123
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
123
    if (BIT_WIDTH == 0) return 0;
178
179
123
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
123
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
123
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
123
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
123
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
123
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
123
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
123
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
123
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
123
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
123
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
123
    constexpr bool READ_32_BITS =
203
123
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
123
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
123
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
123
    word >>= FIRST_BIT_OFFSET;
213
214
123
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
123
    return word & mask;
221
123
}
_ZN5doris11UnpackValueILi7ELi2ELb1EEEmPKh
Line
Count
Source
176
123
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
123
    if (BIT_WIDTH == 0) return 0;
178
179
123
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
123
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
123
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
123
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
123
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
123
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
123
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
123
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
123
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
123
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
123
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
123
    constexpr bool READ_32_BITS =
203
123
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
123
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
123
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
123
    word >>= FIRST_BIT_OFFSET;
213
214
123
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
123
    return word & mask;
221
123
}
_ZN5doris11UnpackValueILi7ELi3ELb1EEEmPKh
Line
Count
Source
176
123
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
123
    if (BIT_WIDTH == 0) return 0;
178
179
123
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
123
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
123
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
123
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
123
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
123
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
123
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
123
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
123
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
123
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
123
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
123
    constexpr bool READ_32_BITS =
203
123
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
123
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
123
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
123
    word >>= FIRST_BIT_OFFSET;
213
214
123
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
123
    return word & mask;
221
123
}
_ZN5doris11UnpackValueILi7ELi4ELb1EEEmPKh
Line
Count
Source
176
123
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
123
    if (BIT_WIDTH == 0) return 0;
178
179
123
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
123
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
123
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
123
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
123
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
123
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
123
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
123
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
123
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
123
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
123
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
123
    constexpr bool READ_32_BITS =
203
123
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
123
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
123
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
123
    word >>= FIRST_BIT_OFFSET;
213
214
123
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
123
    return word & mask;
221
123
}
_ZN5doris11UnpackValueILi7ELi5ELb1EEEmPKh
Line
Count
Source
176
123
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
123
    if (BIT_WIDTH == 0) return 0;
178
179
123
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
123
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
123
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
123
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
123
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
123
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
123
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
123
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
123
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
123
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
123
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
123
    constexpr bool READ_32_BITS =
203
123
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
123
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
123
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
123
    word >>= FIRST_BIT_OFFSET;
213
214
123
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
123
    return word & mask;
221
123
}
_ZN5doris11UnpackValueILi7ELi6ELb1EEEmPKh
Line
Count
Source
176
123
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
123
    if (BIT_WIDTH == 0) return 0;
178
179
123
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
123
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
123
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
123
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
123
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
123
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
123
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
123
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
123
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
123
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
123
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
123
    constexpr bool READ_32_BITS =
203
123
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
123
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
123
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
123
    word >>= FIRST_BIT_OFFSET;
213
214
123
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
123
    return word & mask;
221
123
}
_ZN5doris11UnpackValueILi7ELi7ELb1EEEmPKh
Line
Count
Source
176
123
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
123
    if (BIT_WIDTH == 0) return 0;
178
179
123
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
123
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
123
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
123
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
123
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
123
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
123
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
123
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
123
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
123
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
123
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
123
    constexpr bool READ_32_BITS =
203
123
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
123
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
123
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
123
    word >>= FIRST_BIT_OFFSET;
213
214
123
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
123
    return word & mask;
221
123
}
_ZN5doris11UnpackValueILi7ELi8ELb1EEEmPKh
Line
Count
Source
176
123
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
123
    if (BIT_WIDTH == 0) return 0;
178
179
123
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
123
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
123
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
123
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
123
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
123
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
123
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
123
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
123
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
123
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
123
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
123
    constexpr bool READ_32_BITS =
203
123
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
123
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
123
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
123
    word >>= FIRST_BIT_OFFSET;
213
214
123
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
123
    return word & mask;
221
123
}
_ZN5doris11UnpackValueILi7ELi9ELb1EEEmPKh
Line
Count
Source
176
123
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
123
    if (BIT_WIDTH == 0) return 0;
178
179
123
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
123
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
123
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
123
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
123
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
123
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
123
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
123
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
123
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
123
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
123
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
123
    constexpr bool READ_32_BITS =
203
123
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
123
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
123
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
123
    word >>= FIRST_BIT_OFFSET;
213
214
123
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
123
    return word & mask;
221
123
}
_ZN5doris11UnpackValueILi7ELi10ELb1EEEmPKh
Line
Count
Source
176
123
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
123
    if (BIT_WIDTH == 0) return 0;
178
179
123
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
123
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
123
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
123
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
123
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
123
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
123
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
123
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
123
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
123
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
123
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
123
    constexpr bool READ_32_BITS =
203
123
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
123
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
123
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
123
    word >>= FIRST_BIT_OFFSET;
213
214
123
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
123
    return word & mask;
221
123
}
_ZN5doris11UnpackValueILi7ELi11ELb1EEEmPKh
Line
Count
Source
176
123
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
123
    if (BIT_WIDTH == 0) return 0;
178
179
123
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
123
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
123
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
123
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
123
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
123
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
123
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
123
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
123
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
123
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
123
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
123
    constexpr bool READ_32_BITS =
203
123
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
123
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
123
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
123
    word >>= FIRST_BIT_OFFSET;
213
214
123
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
123
    return word & mask;
221
123
}
_ZN5doris11UnpackValueILi7ELi12ELb1EEEmPKh
Line
Count
Source
176
123
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
123
    if (BIT_WIDTH == 0) return 0;
178
179
123
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
123
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
123
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
123
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
123
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
123
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
123
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
123
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
123
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
123
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
123
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
123
    constexpr bool READ_32_BITS =
203
123
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
123
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
123
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
123
    word >>= FIRST_BIT_OFFSET;
213
214
123
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
123
    return word & mask;
221
123
}
_ZN5doris11UnpackValueILi7ELi13ELb1EEEmPKh
Line
Count
Source
176
123
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
123
    if (BIT_WIDTH == 0) return 0;
178
179
123
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
123
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
123
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
123
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
123
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
123
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
123
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
123
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
123
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
123
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
123
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
123
    constexpr bool READ_32_BITS =
203
123
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
123
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
123
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
123
    word >>= FIRST_BIT_OFFSET;
213
214
123
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
123
    return word & mask;
221
123
}
_ZN5doris11UnpackValueILi7ELi14ELb1EEEmPKh
Line
Count
Source
176
123
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
123
    if (BIT_WIDTH == 0) return 0;
178
179
123
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
123
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
123
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
123
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
123
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
123
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
123
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
123
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
123
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
123
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
123
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
123
    constexpr bool READ_32_BITS =
203
123
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
123
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
123
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
123
    word >>= FIRST_BIT_OFFSET;
213
214
123
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
123
    return word & mask;
221
123
}
_ZN5doris11UnpackValueILi7ELi15ELb1EEEmPKh
Line
Count
Source
176
123
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
123
    if (BIT_WIDTH == 0) return 0;
178
179
123
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
123
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
123
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
123
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
123
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
123
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
123
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
123
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
123
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
123
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
123
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
123
    constexpr bool READ_32_BITS =
203
123
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
123
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
123
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
123
    word >>= FIRST_BIT_OFFSET;
213
214
123
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
123
    return word & mask;
221
123
}
_ZN5doris11UnpackValueILi7ELi16ELb1EEEmPKh
Line
Count
Source
176
123
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
123
    if (BIT_WIDTH == 0) return 0;
178
179
123
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
123
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
123
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
123
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
123
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
123
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
123
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
123
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
123
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
123
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
123
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
123
    constexpr bool READ_32_BITS =
203
123
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
123
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
123
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
123
    word >>= FIRST_BIT_OFFSET;
213
214
123
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
123
    return word & mask;
221
123
}
_ZN5doris11UnpackValueILi7ELi17ELb1EEEmPKh
Line
Count
Source
176
123
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
123
    if (BIT_WIDTH == 0) return 0;
178
179
123
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
123
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
123
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
123
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
123
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
123
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
123
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
123
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
123
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
123
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
123
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
123
    constexpr bool READ_32_BITS =
203
123
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
123
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
123
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
123
    word >>= FIRST_BIT_OFFSET;
213
214
123
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
123
    return word & mask;
221
123
}
_ZN5doris11UnpackValueILi7ELi18ELb1EEEmPKh
Line
Count
Source
176
123
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
123
    if (BIT_WIDTH == 0) return 0;
178
179
123
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
123
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
123
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
123
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
123
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
123
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
123
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
123
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
123
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
123
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
123
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
123
    constexpr bool READ_32_BITS =
203
123
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
123
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
123
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
123
    word >>= FIRST_BIT_OFFSET;
213
214
123
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
123
    return word & mask;
221
123
}
_ZN5doris11UnpackValueILi7ELi19ELb1EEEmPKh
Line
Count
Source
176
123
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
123
    if (BIT_WIDTH == 0) return 0;
178
179
123
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
123
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
123
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
123
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
123
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
123
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
123
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
123
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
123
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
123
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
123
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
123
    constexpr bool READ_32_BITS =
203
123
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
123
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
123
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
123
    word >>= FIRST_BIT_OFFSET;
213
214
123
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
123
    return word & mask;
221
123
}
_ZN5doris11UnpackValueILi7ELi20ELb1EEEmPKh
Line
Count
Source
176
123
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
123
    if (BIT_WIDTH == 0) return 0;
178
179
123
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
123
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
123
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
123
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
123
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
123
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
123
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
123
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
123
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
123
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
123
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
123
    constexpr bool READ_32_BITS =
203
123
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
123
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
123
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
123
    word >>= FIRST_BIT_OFFSET;
213
214
123
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
123
    return word & mask;
221
123
}
_ZN5doris11UnpackValueILi7ELi21ELb1EEEmPKh
Line
Count
Source
176
123
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
123
    if (BIT_WIDTH == 0) return 0;
178
179
123
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
123
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
123
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
123
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
123
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
123
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
123
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
123
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
123
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
123
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
123
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
123
    constexpr bool READ_32_BITS =
203
123
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
123
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
123
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
123
    word >>= FIRST_BIT_OFFSET;
213
214
123
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
123
    return word & mask;
221
123
}
_ZN5doris11UnpackValueILi7ELi22ELb1EEEmPKh
Line
Count
Source
176
123
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
123
    if (BIT_WIDTH == 0) return 0;
178
179
123
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
123
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
123
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
123
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
123
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
123
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
123
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
123
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
123
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
123
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
123
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
123
    constexpr bool READ_32_BITS =
203
123
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
123
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
123
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
123
    word >>= FIRST_BIT_OFFSET;
213
214
123
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
123
    return word & mask;
221
123
}
_ZN5doris11UnpackValueILi7ELi23ELb1EEEmPKh
Line
Count
Source
176
123
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
123
    if (BIT_WIDTH == 0) return 0;
178
179
123
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
123
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
123
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
123
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
123
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
123
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
123
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
123
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
123
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
123
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
123
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
123
    constexpr bool READ_32_BITS =
203
123
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
123
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
123
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
123
    word >>= FIRST_BIT_OFFSET;
213
214
123
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
123
    return word & mask;
221
123
}
_ZN5doris11UnpackValueILi7ELi24ELb1EEEmPKh
Line
Count
Source
176
123
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
123
    if (BIT_WIDTH == 0) return 0;
178
179
123
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
123
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
123
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
123
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
123
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
123
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
123
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
123
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
123
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
123
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
123
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
123
    constexpr bool READ_32_BITS =
203
123
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
123
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
123
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
123
    word >>= FIRST_BIT_OFFSET;
213
214
123
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
123
    return word & mask;
221
123
}
_ZN5doris11UnpackValueILi7ELi25ELb1EEEmPKh
Line
Count
Source
176
123
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
123
    if (BIT_WIDTH == 0) return 0;
178
179
123
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
123
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
123
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
123
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
123
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
123
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
123
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
123
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
123
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
123
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
123
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
123
    constexpr bool READ_32_BITS =
203
123
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
123
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
123
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
123
    word >>= FIRST_BIT_OFFSET;
213
214
123
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
123
    return word & mask;
221
123
}
_ZN5doris11UnpackValueILi7ELi26ELb1EEEmPKh
Line
Count
Source
176
123
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
123
    if (BIT_WIDTH == 0) return 0;
178
179
123
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
123
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
123
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
123
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
123
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
123
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
123
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
123
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
123
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
123
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
123
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
123
    constexpr bool READ_32_BITS =
203
123
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
123
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
123
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
123
    word >>= FIRST_BIT_OFFSET;
213
214
123
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
123
    return word & mask;
221
123
}
_ZN5doris11UnpackValueILi7ELi27ELb1EEEmPKh
Line
Count
Source
176
123
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
123
    if (BIT_WIDTH == 0) return 0;
178
179
123
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
123
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
123
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
123
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
123
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
123
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
123
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
123
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
123
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
123
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
123
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
123
    constexpr bool READ_32_BITS =
203
123
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
123
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
123
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
123
    word >>= FIRST_BIT_OFFSET;
213
214
123
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
123
    return word & mask;
221
123
}
_ZN5doris11UnpackValueILi7ELi28ELb1EEEmPKh
Line
Count
Source
176
123
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
123
    if (BIT_WIDTH == 0) return 0;
178
179
123
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
123
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
123
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
123
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
123
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
123
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
123
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
123
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
123
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
123
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
123
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
123
    constexpr bool READ_32_BITS =
203
123
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
123
    if (READ_32_BITS) {
206
123
        uint32_t word = in[FIRST_WORD_IDX];
207
123
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
123
        return word & mask;
209
123
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
123
}
_ZN5doris11UnpackValueILi7ELi29ELb1EEEmPKh
Line
Count
Source
176
123
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
123
    if (BIT_WIDTH == 0) return 0;
178
179
123
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
123
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
123
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
123
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
123
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
123
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
123
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
123
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
123
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
123
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
123
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
123
    constexpr bool READ_32_BITS =
203
123
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
123
    if (READ_32_BITS) {
206
123
        uint32_t word = in[FIRST_WORD_IDX];
207
123
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
123
        return word & mask;
209
123
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
123
}
_ZN5doris11UnpackValueILi7ELi30ELb1EEEmPKh
Line
Count
Source
176
123
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
123
    if (BIT_WIDTH == 0) return 0;
178
179
123
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
123
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
123
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
123
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
123
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
123
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
123
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
123
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
123
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
123
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
123
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
123
    constexpr bool READ_32_BITS =
203
123
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
123
    if (READ_32_BITS) {
206
123
        uint32_t word = in[FIRST_WORD_IDX];
207
123
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
123
        return word & mask;
209
123
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
123
}
_ZN5doris11UnpackValueILi7ELi31ELb1EEEmPKh
Line
Count
Source
176
123
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
123
    if (BIT_WIDTH == 0) return 0;
178
179
123
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
123
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
123
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
123
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
123
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
123
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
123
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
123
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
123
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
123
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
123
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
123
    constexpr bool READ_32_BITS =
203
123
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
123
    if (READ_32_BITS) {
206
123
        uint32_t word = in[FIRST_WORD_IDX];
207
123
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
123
        return word & mask;
209
123
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
123
}
Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi7ELi8ELb0EEEmPKh
_ZN5doris11UnpackValueILi7ELi7ELb0EEEmPKh
Line
Count
Source
176
41
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
41
    if (BIT_WIDTH == 0) return 0;
178
179
41
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
41
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
41
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
41
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
41
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
41
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
41
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
41
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
41
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
41
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
41
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
41
    constexpr bool READ_32_BITS =
203
41
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
41
    if (READ_32_BITS) {
206
41
        uint32_t word = in[FIRST_WORD_IDX];
207
41
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
41
        return word & mask;
209
41
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
41
}
_ZN5doris11UnpackValueILi7ELi6ELb0EEEmPKh
Line
Count
Source
176
41
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
41
    if (BIT_WIDTH == 0) return 0;
178
179
41
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
41
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
41
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
41
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
41
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
41
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
41
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
41
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
41
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
41
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
41
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
41
    constexpr bool READ_32_BITS =
203
41
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
41
    if (READ_32_BITS) {
206
41
        uint32_t word = in[FIRST_WORD_IDX];
207
41
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
41
        return word & mask;
209
41
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
41
}
_ZN5doris11UnpackValueILi7ELi5ELb0EEEmPKh
Line
Count
Source
176
41
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
41
    if (BIT_WIDTH == 0) return 0;
178
179
41
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
41
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
41
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
41
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
41
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
41
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
41
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
41
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
41
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
41
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
41
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
41
    constexpr bool READ_32_BITS =
203
41
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
41
    if (READ_32_BITS) {
206
41
        uint32_t word = in[FIRST_WORD_IDX];
207
41
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
41
        return word & mask;
209
41
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
41
}
_ZN5doris11UnpackValueILi7ELi4ELb0EEEmPKh
Line
Count
Source
176
41
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
41
    if (BIT_WIDTH == 0) return 0;
178
179
41
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
41
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
41
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
41
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
41
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
41
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
41
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
41
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
41
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
41
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
41
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
41
    constexpr bool READ_32_BITS =
203
41
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
41
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
41
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
41
    word >>= FIRST_BIT_OFFSET;
213
214
41
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
41
    return word & mask;
221
41
}
_ZN5doris11UnpackValueILi7ELi3ELb0EEEmPKh
Line
Count
Source
176
41
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
41
    if (BIT_WIDTH == 0) return 0;
178
179
41
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
41
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
41
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
41
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
41
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
41
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
41
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
41
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
41
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
41
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
41
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
41
    constexpr bool READ_32_BITS =
203
41
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
41
    if (READ_32_BITS) {
206
41
        uint32_t word = in[FIRST_WORD_IDX];
207
41
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
41
        return word & mask;
209
41
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
41
}
_ZN5doris11UnpackValueILi7ELi2ELb0EEEmPKh
Line
Count
Source
176
41
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
41
    if (BIT_WIDTH == 0) return 0;
178
179
41
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
41
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
41
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
41
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
41
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
41
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
41
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
41
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
41
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
41
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
41
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
41
    constexpr bool READ_32_BITS =
203
41
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
41
    if (READ_32_BITS) {
206
41
        uint32_t word = in[FIRST_WORD_IDX];
207
41
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
41
        return word & mask;
209
41
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
41
}
_ZN5doris11UnpackValueILi7ELi1ELb0EEEmPKh
Line
Count
Source
176
41
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
41
    if (BIT_WIDTH == 0) return 0;
178
179
41
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
41
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
41
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
41
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
41
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
41
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
41
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
41
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
41
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
41
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
41
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
41
    constexpr bool READ_32_BITS =
203
41
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
41
    if (READ_32_BITS) {
206
41
        uint32_t word = in[FIRST_WORD_IDX];
207
41
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
41
        return word & mask;
209
41
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
41
}
_ZN5doris11UnpackValueILi7ELi0ELb0EEEmPKh
Line
Count
Source
176
41
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
41
    if (BIT_WIDTH == 0) return 0;
178
179
41
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
41
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
41
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
41
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
41
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
41
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
41
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
41
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
41
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
41
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
41
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
41
    constexpr bool READ_32_BITS =
203
41
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
41
    if (READ_32_BITS) {
206
41
        uint32_t word = in[FIRST_WORD_IDX];
207
41
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
41
        return word & mask;
209
41
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
41
}
_ZN5doris11UnpackValueILi8ELi0ELb1EEEmPKh
Line
Count
Source
176
18.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
18.7k
    if (BIT_WIDTH == 0) return 0;
178
179
18.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
18.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
18.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
18.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
18.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
18.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
18.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
18.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
18.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
18.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
18.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
18.7k
    constexpr bool READ_32_BITS =
203
18.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
18.7k
    if (READ_32_BITS) {
206
18.7k
        uint32_t word = in[FIRST_WORD_IDX];
207
18.7k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
18.7k
        return word & mask;
209
18.7k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
18.7k
}
_ZN5doris11UnpackValueILi8ELi1ELb1EEEmPKh
Line
Count
Source
176
18.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
18.7k
    if (BIT_WIDTH == 0) return 0;
178
179
18.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
18.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
18.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
18.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
18.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
18.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
18.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
18.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
18.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
18.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
18.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
18.7k
    constexpr bool READ_32_BITS =
203
18.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
18.7k
    if (READ_32_BITS) {
206
18.7k
        uint32_t word = in[FIRST_WORD_IDX];
207
18.7k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
18.7k
        return word & mask;
209
18.7k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
18.7k
}
_ZN5doris11UnpackValueILi8ELi2ELb1EEEmPKh
Line
Count
Source
176
18.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
18.7k
    if (BIT_WIDTH == 0) return 0;
178
179
18.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
18.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
18.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
18.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
18.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
18.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
18.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
18.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
18.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
18.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
18.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
18.7k
    constexpr bool READ_32_BITS =
203
18.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
18.7k
    if (READ_32_BITS) {
206
18.7k
        uint32_t word = in[FIRST_WORD_IDX];
207
18.7k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
18.7k
        return word & mask;
209
18.7k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
18.7k
}
_ZN5doris11UnpackValueILi8ELi3ELb1EEEmPKh
Line
Count
Source
176
18.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
18.7k
    if (BIT_WIDTH == 0) return 0;
178
179
18.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
18.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
18.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
18.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
18.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
18.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
18.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
18.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
18.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
18.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
18.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
18.7k
    constexpr bool READ_32_BITS =
203
18.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
18.7k
    if (READ_32_BITS) {
206
18.7k
        uint32_t word = in[FIRST_WORD_IDX];
207
18.7k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
18.7k
        return word & mask;
209
18.7k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
18.7k
}
_ZN5doris11UnpackValueILi8ELi4ELb1EEEmPKh
Line
Count
Source
176
18.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
18.7k
    if (BIT_WIDTH == 0) return 0;
178
179
18.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
18.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
18.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
18.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
18.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
18.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
18.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
18.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
18.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
18.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
18.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
18.7k
    constexpr bool READ_32_BITS =
203
18.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
18.7k
    if (READ_32_BITS) {
206
18.7k
        uint32_t word = in[FIRST_WORD_IDX];
207
18.7k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
18.7k
        return word & mask;
209
18.7k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
18.7k
}
_ZN5doris11UnpackValueILi8ELi5ELb1EEEmPKh
Line
Count
Source
176
18.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
18.7k
    if (BIT_WIDTH == 0) return 0;
178
179
18.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
18.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
18.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
18.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
18.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
18.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
18.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
18.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
18.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
18.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
18.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
18.7k
    constexpr bool READ_32_BITS =
203
18.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
18.7k
    if (READ_32_BITS) {
206
18.7k
        uint32_t word = in[FIRST_WORD_IDX];
207
18.7k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
18.7k
        return word & mask;
209
18.7k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
18.7k
}
_ZN5doris11UnpackValueILi8ELi6ELb1EEEmPKh
Line
Count
Source
176
18.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
18.7k
    if (BIT_WIDTH == 0) return 0;
178
179
18.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
18.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
18.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
18.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
18.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
18.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
18.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
18.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
18.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
18.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
18.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
18.7k
    constexpr bool READ_32_BITS =
203
18.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
18.7k
    if (READ_32_BITS) {
206
18.7k
        uint32_t word = in[FIRST_WORD_IDX];
207
18.7k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
18.7k
        return word & mask;
209
18.7k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
18.7k
}
_ZN5doris11UnpackValueILi8ELi7ELb1EEEmPKh
Line
Count
Source
176
18.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
18.7k
    if (BIT_WIDTH == 0) return 0;
178
179
18.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
18.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
18.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
18.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
18.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
18.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
18.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
18.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
18.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
18.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
18.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
18.7k
    constexpr bool READ_32_BITS =
203
18.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
18.7k
    if (READ_32_BITS) {
206
18.7k
        uint32_t word = in[FIRST_WORD_IDX];
207
18.7k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
18.7k
        return word & mask;
209
18.7k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
18.7k
}
_ZN5doris11UnpackValueILi8ELi8ELb1EEEmPKh
Line
Count
Source
176
18.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
18.7k
    if (BIT_WIDTH == 0) return 0;
178
179
18.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
18.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
18.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
18.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
18.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
18.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
18.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
18.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
18.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
18.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
18.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
18.7k
    constexpr bool READ_32_BITS =
203
18.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
18.7k
    if (READ_32_BITS) {
206
18.7k
        uint32_t word = in[FIRST_WORD_IDX];
207
18.7k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
18.7k
        return word & mask;
209
18.7k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
18.7k
}
_ZN5doris11UnpackValueILi8ELi9ELb1EEEmPKh
Line
Count
Source
176
18.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
18.7k
    if (BIT_WIDTH == 0) return 0;
178
179
18.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
18.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
18.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
18.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
18.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
18.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
18.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
18.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
18.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
18.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
18.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
18.7k
    constexpr bool READ_32_BITS =
203
18.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
18.7k
    if (READ_32_BITS) {
206
18.7k
        uint32_t word = in[FIRST_WORD_IDX];
207
18.7k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
18.7k
        return word & mask;
209
18.7k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
18.7k
}
_ZN5doris11UnpackValueILi8ELi10ELb1EEEmPKh
Line
Count
Source
176
18.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
18.7k
    if (BIT_WIDTH == 0) return 0;
178
179
18.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
18.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
18.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
18.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
18.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
18.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
18.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
18.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
18.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
18.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
18.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
18.7k
    constexpr bool READ_32_BITS =
203
18.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
18.7k
    if (READ_32_BITS) {
206
18.7k
        uint32_t word = in[FIRST_WORD_IDX];
207
18.7k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
18.7k
        return word & mask;
209
18.7k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
18.7k
}
_ZN5doris11UnpackValueILi8ELi11ELb1EEEmPKh
Line
Count
Source
176
18.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
18.7k
    if (BIT_WIDTH == 0) return 0;
178
179
18.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
18.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
18.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
18.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
18.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
18.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
18.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
18.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
18.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
18.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
18.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
18.7k
    constexpr bool READ_32_BITS =
203
18.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
18.7k
    if (READ_32_BITS) {
206
18.7k
        uint32_t word = in[FIRST_WORD_IDX];
207
18.7k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
18.7k
        return word & mask;
209
18.7k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
18.7k
}
_ZN5doris11UnpackValueILi8ELi12ELb1EEEmPKh
Line
Count
Source
176
18.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
18.7k
    if (BIT_WIDTH == 0) return 0;
178
179
18.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
18.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
18.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
18.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
18.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
18.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
18.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
18.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
18.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
18.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
18.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
18.7k
    constexpr bool READ_32_BITS =
203
18.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
18.7k
    if (READ_32_BITS) {
206
18.7k
        uint32_t word = in[FIRST_WORD_IDX];
207
18.7k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
18.7k
        return word & mask;
209
18.7k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
18.7k
}
_ZN5doris11UnpackValueILi8ELi13ELb1EEEmPKh
Line
Count
Source
176
18.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
18.7k
    if (BIT_WIDTH == 0) return 0;
178
179
18.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
18.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
18.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
18.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
18.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
18.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
18.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
18.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
18.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
18.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
18.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
18.7k
    constexpr bool READ_32_BITS =
203
18.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
18.7k
    if (READ_32_BITS) {
206
18.7k
        uint32_t word = in[FIRST_WORD_IDX];
207
18.7k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
18.7k
        return word & mask;
209
18.7k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
18.7k
}
_ZN5doris11UnpackValueILi8ELi14ELb1EEEmPKh
Line
Count
Source
176
18.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
18.7k
    if (BIT_WIDTH == 0) return 0;
178
179
18.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
18.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
18.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
18.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
18.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
18.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
18.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
18.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
18.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
18.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
18.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
18.7k
    constexpr bool READ_32_BITS =
203
18.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
18.7k
    if (READ_32_BITS) {
206
18.7k
        uint32_t word = in[FIRST_WORD_IDX];
207
18.7k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
18.7k
        return word & mask;
209
18.7k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
18.7k
}
_ZN5doris11UnpackValueILi8ELi15ELb1EEEmPKh
Line
Count
Source
176
18.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
18.7k
    if (BIT_WIDTH == 0) return 0;
178
179
18.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
18.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
18.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
18.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
18.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
18.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
18.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
18.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
18.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
18.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
18.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
18.7k
    constexpr bool READ_32_BITS =
203
18.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
18.7k
    if (READ_32_BITS) {
206
18.7k
        uint32_t word = in[FIRST_WORD_IDX];
207
18.7k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
18.7k
        return word & mask;
209
18.7k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
18.7k
}
_ZN5doris11UnpackValueILi8ELi16ELb1EEEmPKh
Line
Count
Source
176
18.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
18.7k
    if (BIT_WIDTH == 0) return 0;
178
179
18.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
18.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
18.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
18.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
18.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
18.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
18.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
18.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
18.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
18.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
18.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
18.7k
    constexpr bool READ_32_BITS =
203
18.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
18.7k
    if (READ_32_BITS) {
206
18.7k
        uint32_t word = in[FIRST_WORD_IDX];
207
18.7k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
18.7k
        return word & mask;
209
18.7k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
18.7k
}
_ZN5doris11UnpackValueILi8ELi17ELb1EEEmPKh
Line
Count
Source
176
18.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
18.7k
    if (BIT_WIDTH == 0) return 0;
178
179
18.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
18.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
18.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
18.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
18.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
18.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
18.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
18.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
18.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
18.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
18.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
18.7k
    constexpr bool READ_32_BITS =
203
18.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
18.7k
    if (READ_32_BITS) {
206
18.7k
        uint32_t word = in[FIRST_WORD_IDX];
207
18.7k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
18.7k
        return word & mask;
209
18.7k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
18.7k
}
_ZN5doris11UnpackValueILi8ELi18ELb1EEEmPKh
Line
Count
Source
176
18.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
18.7k
    if (BIT_WIDTH == 0) return 0;
178
179
18.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
18.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
18.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
18.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
18.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
18.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
18.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
18.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
18.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
18.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
18.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
18.7k
    constexpr bool READ_32_BITS =
203
18.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
18.7k
    if (READ_32_BITS) {
206
18.7k
        uint32_t word = in[FIRST_WORD_IDX];
207
18.7k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
18.7k
        return word & mask;
209
18.7k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
18.7k
}
_ZN5doris11UnpackValueILi8ELi19ELb1EEEmPKh
Line
Count
Source
176
18.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
18.7k
    if (BIT_WIDTH == 0) return 0;
178
179
18.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
18.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
18.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
18.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
18.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
18.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
18.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
18.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
18.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
18.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
18.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
18.7k
    constexpr bool READ_32_BITS =
203
18.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
18.7k
    if (READ_32_BITS) {
206
18.7k
        uint32_t word = in[FIRST_WORD_IDX];
207
18.7k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
18.7k
        return word & mask;
209
18.7k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
18.7k
}
_ZN5doris11UnpackValueILi8ELi20ELb1EEEmPKh
Line
Count
Source
176
18.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
18.7k
    if (BIT_WIDTH == 0) return 0;
178
179
18.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
18.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
18.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
18.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
18.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
18.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
18.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
18.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
18.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
18.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
18.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
18.7k
    constexpr bool READ_32_BITS =
203
18.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
18.7k
    if (READ_32_BITS) {
206
18.7k
        uint32_t word = in[FIRST_WORD_IDX];
207
18.7k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
18.7k
        return word & mask;
209
18.7k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
18.7k
}
_ZN5doris11UnpackValueILi8ELi21ELb1EEEmPKh
Line
Count
Source
176
18.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
18.7k
    if (BIT_WIDTH == 0) return 0;
178
179
18.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
18.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
18.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
18.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
18.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
18.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
18.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
18.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
18.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
18.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
18.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
18.7k
    constexpr bool READ_32_BITS =
203
18.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
18.7k
    if (READ_32_BITS) {
206
18.7k
        uint32_t word = in[FIRST_WORD_IDX];
207
18.7k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
18.7k
        return word & mask;
209
18.7k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
18.7k
}
_ZN5doris11UnpackValueILi8ELi22ELb1EEEmPKh
Line
Count
Source
176
18.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
18.7k
    if (BIT_WIDTH == 0) return 0;
178
179
18.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
18.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
18.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
18.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
18.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
18.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
18.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
18.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
18.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
18.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
18.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
18.7k
    constexpr bool READ_32_BITS =
203
18.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
18.7k
    if (READ_32_BITS) {
206
18.7k
        uint32_t word = in[FIRST_WORD_IDX];
207
18.7k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
18.7k
        return word & mask;
209
18.7k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
18.7k
}
_ZN5doris11UnpackValueILi8ELi23ELb1EEEmPKh
Line
Count
Source
176
18.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
18.7k
    if (BIT_WIDTH == 0) return 0;
178
179
18.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
18.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
18.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
18.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
18.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
18.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
18.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
18.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
18.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
18.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
18.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
18.7k
    constexpr bool READ_32_BITS =
203
18.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
18.7k
    if (READ_32_BITS) {
206
18.7k
        uint32_t word = in[FIRST_WORD_IDX];
207
18.7k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
18.7k
        return word & mask;
209
18.7k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
18.7k
}
_ZN5doris11UnpackValueILi8ELi24ELb1EEEmPKh
Line
Count
Source
176
18.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
18.7k
    if (BIT_WIDTH == 0) return 0;
178
179
18.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
18.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
18.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
18.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
18.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
18.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
18.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
18.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
18.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
18.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
18.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
18.7k
    constexpr bool READ_32_BITS =
203
18.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
18.7k
    if (READ_32_BITS) {
206
18.7k
        uint32_t word = in[FIRST_WORD_IDX];
207
18.7k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
18.7k
        return word & mask;
209
18.7k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
18.7k
}
_ZN5doris11UnpackValueILi8ELi25ELb1EEEmPKh
Line
Count
Source
176
18.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
18.7k
    if (BIT_WIDTH == 0) return 0;
178
179
18.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
18.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
18.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
18.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
18.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
18.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
18.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
18.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
18.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
18.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
18.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
18.7k
    constexpr bool READ_32_BITS =
203
18.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
18.7k
    if (READ_32_BITS) {
206
18.7k
        uint32_t word = in[FIRST_WORD_IDX];
207
18.7k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
18.7k
        return word & mask;
209
18.7k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
18.7k
}
_ZN5doris11UnpackValueILi8ELi26ELb1EEEmPKh
Line
Count
Source
176
18.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
18.7k
    if (BIT_WIDTH == 0) return 0;
178
179
18.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
18.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
18.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
18.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
18.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
18.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
18.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
18.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
18.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
18.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
18.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
18.7k
    constexpr bool READ_32_BITS =
203
18.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
18.7k
    if (READ_32_BITS) {
206
18.7k
        uint32_t word = in[FIRST_WORD_IDX];
207
18.7k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
18.7k
        return word & mask;
209
18.7k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
18.7k
}
_ZN5doris11UnpackValueILi8ELi27ELb1EEEmPKh
Line
Count
Source
176
18.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
18.7k
    if (BIT_WIDTH == 0) return 0;
178
179
18.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
18.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
18.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
18.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
18.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
18.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
18.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
18.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
18.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
18.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
18.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
18.7k
    constexpr bool READ_32_BITS =
203
18.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
18.7k
    if (READ_32_BITS) {
206
18.7k
        uint32_t word = in[FIRST_WORD_IDX];
207
18.7k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
18.7k
        return word & mask;
209
18.7k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
18.7k
}
_ZN5doris11UnpackValueILi8ELi28ELb1EEEmPKh
Line
Count
Source
176
18.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
18.7k
    if (BIT_WIDTH == 0) return 0;
178
179
18.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
18.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
18.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
18.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
18.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
18.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
18.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
18.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
18.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
18.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
18.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
18.7k
    constexpr bool READ_32_BITS =
203
18.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
18.7k
    if (READ_32_BITS) {
206
18.7k
        uint32_t word = in[FIRST_WORD_IDX];
207
18.7k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
18.7k
        return word & mask;
209
18.7k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
18.7k
}
_ZN5doris11UnpackValueILi8ELi29ELb1EEEmPKh
Line
Count
Source
176
18.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
18.7k
    if (BIT_WIDTH == 0) return 0;
178
179
18.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
18.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
18.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
18.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
18.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
18.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
18.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
18.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
18.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
18.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
18.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
18.7k
    constexpr bool READ_32_BITS =
203
18.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
18.7k
    if (READ_32_BITS) {
206
18.7k
        uint32_t word = in[FIRST_WORD_IDX];
207
18.7k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
18.7k
        return word & mask;
209
18.7k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
18.7k
}
_ZN5doris11UnpackValueILi8ELi30ELb1EEEmPKh
Line
Count
Source
176
18.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
18.7k
    if (BIT_WIDTH == 0) return 0;
178
179
18.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
18.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
18.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
18.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
18.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
18.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
18.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
18.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
18.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
18.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
18.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
18.7k
    constexpr bool READ_32_BITS =
203
18.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
18.7k
    if (READ_32_BITS) {
206
18.7k
        uint32_t word = in[FIRST_WORD_IDX];
207
18.7k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
18.7k
        return word & mask;
209
18.7k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
18.7k
}
_ZN5doris11UnpackValueILi8ELi31ELb1EEEmPKh
Line
Count
Source
176
18.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
18.7k
    if (BIT_WIDTH == 0) return 0;
178
179
18.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
18.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
18.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
18.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
18.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
18.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
18.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
18.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
18.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
18.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
18.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
18.7k
    constexpr bool READ_32_BITS =
203
18.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
18.7k
    if (READ_32_BITS) {
206
18.7k
        uint32_t word = in[FIRST_WORD_IDX];
207
18.7k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
18.7k
        return word & mask;
209
18.7k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
18.7k
}
Unexecuted instantiation: _ZN5doris11UnpackValueILi8ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi8ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi8ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi8ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi8ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi8ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi8ELi24ELb0EEEmPKh
_ZN5doris11UnpackValueILi8ELi23ELb0EEEmPKh
Line
Count
Source
176
1.24k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.24k
    if (BIT_WIDTH == 0) return 0;
178
179
1.24k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.24k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.24k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.24k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.24k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.24k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.24k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.24k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.24k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.24k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.24k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.24k
    constexpr bool READ_32_BITS =
203
1.24k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.24k
    if (READ_32_BITS) {
206
1.24k
        uint32_t word = in[FIRST_WORD_IDX];
207
1.24k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
1.24k
        return word & mask;
209
1.24k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
1.24k
}
_ZN5doris11UnpackValueILi8ELi22ELb0EEEmPKh
Line
Count
Source
176
1.24k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.24k
    if (BIT_WIDTH == 0) return 0;
178
179
1.24k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.24k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.24k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.24k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.24k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.24k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.24k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.24k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.24k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.24k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.24k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.24k
    constexpr bool READ_32_BITS =
203
1.24k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.24k
    if (READ_32_BITS) {
206
1.24k
        uint32_t word = in[FIRST_WORD_IDX];
207
1.24k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
1.24k
        return word & mask;
209
1.24k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
1.24k
}
_ZN5doris11UnpackValueILi8ELi21ELb0EEEmPKh
Line
Count
Source
176
1.24k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.24k
    if (BIT_WIDTH == 0) return 0;
178
179
1.24k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.24k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.24k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.24k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.24k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.24k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.24k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.24k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.24k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.24k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.24k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.24k
    constexpr bool READ_32_BITS =
203
1.24k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.24k
    if (READ_32_BITS) {
206
1.24k
        uint32_t word = in[FIRST_WORD_IDX];
207
1.24k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
1.24k
        return word & mask;
209
1.24k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
1.24k
}
_ZN5doris11UnpackValueILi8ELi20ELb0EEEmPKh
Line
Count
Source
176
1.24k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.24k
    if (BIT_WIDTH == 0) return 0;
178
179
1.24k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.24k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.24k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.24k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.24k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.24k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.24k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.24k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.24k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.24k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.24k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.24k
    constexpr bool READ_32_BITS =
203
1.24k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.24k
    if (READ_32_BITS) {
206
1.24k
        uint32_t word = in[FIRST_WORD_IDX];
207
1.24k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
1.24k
        return word & mask;
209
1.24k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
1.24k
}
_ZN5doris11UnpackValueILi8ELi19ELb0EEEmPKh
Line
Count
Source
176
1.24k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.24k
    if (BIT_WIDTH == 0) return 0;
178
179
1.24k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.24k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.24k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.24k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.24k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.24k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.24k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.24k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.24k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.24k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.24k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.24k
    constexpr bool READ_32_BITS =
203
1.24k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.24k
    if (READ_32_BITS) {
206
1.24k
        uint32_t word = in[FIRST_WORD_IDX];
207
1.24k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
1.24k
        return word & mask;
209
1.24k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
1.24k
}
_ZN5doris11UnpackValueILi8ELi18ELb0EEEmPKh
Line
Count
Source
176
1.24k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.24k
    if (BIT_WIDTH == 0) return 0;
178
179
1.24k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.24k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.24k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.24k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.24k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.24k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.24k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.24k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.24k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.24k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.24k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.24k
    constexpr bool READ_32_BITS =
203
1.24k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.24k
    if (READ_32_BITS) {
206
1.24k
        uint32_t word = in[FIRST_WORD_IDX];
207
1.24k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
1.24k
        return word & mask;
209
1.24k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
1.24k
}
_ZN5doris11UnpackValueILi8ELi17ELb0EEEmPKh
Line
Count
Source
176
1.24k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.24k
    if (BIT_WIDTH == 0) return 0;
178
179
1.24k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.24k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.24k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.24k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.24k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.24k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.24k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.24k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.24k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.24k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.24k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.24k
    constexpr bool READ_32_BITS =
203
1.24k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.24k
    if (READ_32_BITS) {
206
1.24k
        uint32_t word = in[FIRST_WORD_IDX];
207
1.24k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
1.24k
        return word & mask;
209
1.24k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
1.24k
}
_ZN5doris11UnpackValueILi8ELi16ELb0EEEmPKh
Line
Count
Source
176
1.24k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.24k
    if (BIT_WIDTH == 0) return 0;
178
179
1.24k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.24k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.24k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.24k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.24k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.24k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.24k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.24k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.24k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.24k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.24k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.24k
    constexpr bool READ_32_BITS =
203
1.24k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.24k
    if (READ_32_BITS) {
206
1.24k
        uint32_t word = in[FIRST_WORD_IDX];
207
1.24k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
1.24k
        return word & mask;
209
1.24k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
1.24k
}
_ZN5doris11UnpackValueILi8ELi15ELb0EEEmPKh
Line
Count
Source
176
1.25k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.25k
    if (BIT_WIDTH == 0) return 0;
178
179
1.25k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.25k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.25k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.25k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.25k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.25k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.25k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.25k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.25k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.25k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.25k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.25k
    constexpr bool READ_32_BITS =
203
1.25k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.25k
    if (READ_32_BITS) {
206
1.25k
        uint32_t word = in[FIRST_WORD_IDX];
207
1.25k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
1.25k
        return word & mask;
209
1.25k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
1.25k
}
_ZN5doris11UnpackValueILi8ELi14ELb0EEEmPKh
Line
Count
Source
176
1.25k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.25k
    if (BIT_WIDTH == 0) return 0;
178
179
1.25k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.25k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.25k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.25k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.25k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.25k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.25k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.25k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.25k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.25k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.25k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.25k
    constexpr bool READ_32_BITS =
203
1.25k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.25k
    if (READ_32_BITS) {
206
1.25k
        uint32_t word = in[FIRST_WORD_IDX];
207
1.25k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
1.25k
        return word & mask;
209
1.25k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
1.25k
}
_ZN5doris11UnpackValueILi8ELi13ELb0EEEmPKh
Line
Count
Source
176
1.25k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.25k
    if (BIT_WIDTH == 0) return 0;
178
179
1.25k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.25k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.25k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.25k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.25k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.25k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.25k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.25k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.25k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.25k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.25k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.25k
    constexpr bool READ_32_BITS =
203
1.25k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.25k
    if (READ_32_BITS) {
206
1.25k
        uint32_t word = in[FIRST_WORD_IDX];
207
1.25k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
1.25k
        return word & mask;
209
1.25k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
1.25k
}
_ZN5doris11UnpackValueILi8ELi12ELb0EEEmPKh
Line
Count
Source
176
1.25k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.25k
    if (BIT_WIDTH == 0) return 0;
178
179
1.25k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.25k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.25k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.25k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.25k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.25k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.25k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.25k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.25k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.25k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.25k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.25k
    constexpr bool READ_32_BITS =
203
1.25k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.25k
    if (READ_32_BITS) {
206
1.25k
        uint32_t word = in[FIRST_WORD_IDX];
207
1.25k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
1.25k
        return word & mask;
209
1.25k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
1.25k
}
_ZN5doris11UnpackValueILi8ELi11ELb0EEEmPKh
Line
Count
Source
176
1.25k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.25k
    if (BIT_WIDTH == 0) return 0;
178
179
1.25k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.25k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.25k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.25k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.25k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.25k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.25k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.25k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.25k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.25k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.25k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.25k
    constexpr bool READ_32_BITS =
203
1.25k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.25k
    if (READ_32_BITS) {
206
1.25k
        uint32_t word = in[FIRST_WORD_IDX];
207
1.25k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
1.25k
        return word & mask;
209
1.25k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
1.25k
}
_ZN5doris11UnpackValueILi8ELi10ELb0EEEmPKh
Line
Count
Source
176
1.25k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.25k
    if (BIT_WIDTH == 0) return 0;
178
179
1.25k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.25k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.25k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.25k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.25k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.25k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.25k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.25k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.25k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.25k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.25k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.25k
    constexpr bool READ_32_BITS =
203
1.25k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.25k
    if (READ_32_BITS) {
206
1.25k
        uint32_t word = in[FIRST_WORD_IDX];
207
1.25k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
1.25k
        return word & mask;
209
1.25k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
1.25k
}
_ZN5doris11UnpackValueILi8ELi9ELb0EEEmPKh
Line
Count
Source
176
1.25k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.25k
    if (BIT_WIDTH == 0) return 0;
178
179
1.25k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.25k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.25k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.25k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.25k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.25k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.25k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.25k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.25k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.25k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.25k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.25k
    constexpr bool READ_32_BITS =
203
1.25k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.25k
    if (READ_32_BITS) {
206
1.25k
        uint32_t word = in[FIRST_WORD_IDX];
207
1.25k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
1.25k
        return word & mask;
209
1.25k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
1.25k
}
_ZN5doris11UnpackValueILi8ELi8ELb0EEEmPKh
Line
Count
Source
176
1.25k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.25k
    if (BIT_WIDTH == 0) return 0;
178
179
1.25k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.25k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.25k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.25k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.25k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.25k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.25k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.25k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.25k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.25k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.25k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.25k
    constexpr bool READ_32_BITS =
203
1.25k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.25k
    if (READ_32_BITS) {
206
1.25k
        uint32_t word = in[FIRST_WORD_IDX];
207
1.25k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
1.25k
        return word & mask;
209
1.25k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
1.25k
}
_ZN5doris11UnpackValueILi8ELi7ELb0EEEmPKh
Line
Count
Source
176
1.25k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.25k
    if (BIT_WIDTH == 0) return 0;
178
179
1.25k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.25k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.25k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.25k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.25k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.25k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.25k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.25k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.25k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.25k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.25k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.25k
    constexpr bool READ_32_BITS =
203
1.25k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.25k
    if (READ_32_BITS) {
206
1.25k
        uint32_t word = in[FIRST_WORD_IDX];
207
1.25k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
1.25k
        return word & mask;
209
1.25k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
1.25k
}
_ZN5doris11UnpackValueILi8ELi6ELb0EEEmPKh
Line
Count
Source
176
1.25k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.25k
    if (BIT_WIDTH == 0) return 0;
178
179
1.25k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.25k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.25k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.25k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.25k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.25k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.25k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.25k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.25k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.25k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.25k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.25k
    constexpr bool READ_32_BITS =
203
1.25k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.25k
    if (READ_32_BITS) {
206
1.25k
        uint32_t word = in[FIRST_WORD_IDX];
207
1.25k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
1.25k
        return word & mask;
209
1.25k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
1.25k
}
_ZN5doris11UnpackValueILi8ELi5ELb0EEEmPKh
Line
Count
Source
176
1.25k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.25k
    if (BIT_WIDTH == 0) return 0;
178
179
1.25k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.25k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.25k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.25k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.25k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.25k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.25k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.25k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.25k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.25k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.25k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.25k
    constexpr bool READ_32_BITS =
203
1.25k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.25k
    if (READ_32_BITS) {
206
1.25k
        uint32_t word = in[FIRST_WORD_IDX];
207
1.25k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
1.25k
        return word & mask;
209
1.25k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
1.25k
}
_ZN5doris11UnpackValueILi8ELi4ELb0EEEmPKh
Line
Count
Source
176
1.25k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.25k
    if (BIT_WIDTH == 0) return 0;
178
179
1.25k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.25k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.25k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.25k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.25k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.25k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.25k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.25k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.25k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.25k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.25k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.25k
    constexpr bool READ_32_BITS =
203
1.25k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.25k
    if (READ_32_BITS) {
206
1.25k
        uint32_t word = in[FIRST_WORD_IDX];
207
1.25k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
1.25k
        return word & mask;
209
1.25k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
1.25k
}
_ZN5doris11UnpackValueILi8ELi3ELb0EEEmPKh
Line
Count
Source
176
1.25k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.25k
    if (BIT_WIDTH == 0) return 0;
178
179
1.25k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.25k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.25k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.25k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.25k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.25k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.25k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.25k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.25k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.25k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.25k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.25k
    constexpr bool READ_32_BITS =
203
1.25k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.25k
    if (READ_32_BITS) {
206
1.25k
        uint32_t word = in[FIRST_WORD_IDX];
207
1.25k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
1.25k
        return word & mask;
209
1.25k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
1.25k
}
_ZN5doris11UnpackValueILi8ELi2ELb0EEEmPKh
Line
Count
Source
176
1.25k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.25k
    if (BIT_WIDTH == 0) return 0;
178
179
1.25k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.25k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.25k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.25k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.25k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.25k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.25k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.25k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.25k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.25k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.25k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.25k
    constexpr bool READ_32_BITS =
203
1.25k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.25k
    if (READ_32_BITS) {
206
1.25k
        uint32_t word = in[FIRST_WORD_IDX];
207
1.25k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
1.25k
        return word & mask;
209
1.25k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
1.25k
}
_ZN5doris11UnpackValueILi8ELi1ELb0EEEmPKh
Line
Count
Source
176
1.25k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.25k
    if (BIT_WIDTH == 0) return 0;
178
179
1.25k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.25k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.25k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.25k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.25k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.25k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.25k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.25k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.25k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.25k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.25k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.25k
    constexpr bool READ_32_BITS =
203
1.25k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.25k
    if (READ_32_BITS) {
206
1.25k
        uint32_t word = in[FIRST_WORD_IDX];
207
1.25k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
1.25k
        return word & mask;
209
1.25k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
1.25k
}
_ZN5doris11UnpackValueILi8ELi0ELb0EEEmPKh
Line
Count
Source
176
1.25k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.25k
    if (BIT_WIDTH == 0) return 0;
178
179
1.25k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.25k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.25k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.25k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.25k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.25k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.25k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.25k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.25k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.25k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.25k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.25k
    constexpr bool READ_32_BITS =
203
1.25k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.25k
    if (READ_32_BITS) {
206
1.25k
        uint32_t word = in[FIRST_WORD_IDX];
207
1.25k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
1.25k
        return word & mask;
209
1.25k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
1.25k
}
_ZN5doris11UnpackValueILi9ELi0ELb1EEEmPKh
Line
Count
Source
176
4.83k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.83k
    if (BIT_WIDTH == 0) return 0;
178
179
4.83k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.83k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.83k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.83k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.83k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.83k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.83k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.83k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.83k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.83k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.83k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.83k
    constexpr bool READ_32_BITS =
203
4.83k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.83k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.83k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.83k
    word >>= FIRST_BIT_OFFSET;
213
214
4.83k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.83k
    return word & mask;
221
4.83k
}
_ZN5doris11UnpackValueILi9ELi1ELb1EEEmPKh
Line
Count
Source
176
4.83k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.83k
    if (BIT_WIDTH == 0) return 0;
178
179
4.83k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.83k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.83k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.83k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.83k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.83k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.83k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.83k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.83k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.83k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.83k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.83k
    constexpr bool READ_32_BITS =
203
4.83k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.83k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.83k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.83k
    word >>= FIRST_BIT_OFFSET;
213
214
4.83k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.83k
    return word & mask;
221
4.83k
}
_ZN5doris11UnpackValueILi9ELi2ELb1EEEmPKh
Line
Count
Source
176
4.83k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.83k
    if (BIT_WIDTH == 0) return 0;
178
179
4.83k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.83k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.83k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.83k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.83k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.83k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.83k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.83k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.83k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.83k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.83k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.83k
    constexpr bool READ_32_BITS =
203
4.83k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.83k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.83k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.83k
    word >>= FIRST_BIT_OFFSET;
213
214
4.83k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.83k
    return word & mask;
221
4.83k
}
_ZN5doris11UnpackValueILi9ELi3ELb1EEEmPKh
Line
Count
Source
176
4.83k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.83k
    if (BIT_WIDTH == 0) return 0;
178
179
4.83k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.83k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.83k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.83k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.83k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.83k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.83k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.83k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.83k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.83k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.83k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.83k
    constexpr bool READ_32_BITS =
203
4.83k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.83k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.83k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.83k
    word >>= FIRST_BIT_OFFSET;
213
214
4.83k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.83k
    return word & mask;
221
4.83k
}
_ZN5doris11UnpackValueILi9ELi4ELb1EEEmPKh
Line
Count
Source
176
4.83k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.83k
    if (BIT_WIDTH == 0) return 0;
178
179
4.83k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.83k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.83k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.83k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.83k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.83k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.83k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.83k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.83k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.83k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.83k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.83k
    constexpr bool READ_32_BITS =
203
4.83k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.83k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.83k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.83k
    word >>= FIRST_BIT_OFFSET;
213
214
4.83k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.83k
    return word & mask;
221
4.83k
}
_ZN5doris11UnpackValueILi9ELi5ELb1EEEmPKh
Line
Count
Source
176
4.83k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.83k
    if (BIT_WIDTH == 0) return 0;
178
179
4.83k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.83k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.83k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.83k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.83k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.83k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.83k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.83k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.83k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.83k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.83k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.83k
    constexpr bool READ_32_BITS =
203
4.83k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.83k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.83k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.83k
    word >>= FIRST_BIT_OFFSET;
213
214
4.83k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.83k
    return word & mask;
221
4.83k
}
_ZN5doris11UnpackValueILi9ELi6ELb1EEEmPKh
Line
Count
Source
176
4.83k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.83k
    if (BIT_WIDTH == 0) return 0;
178
179
4.83k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.83k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.83k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.83k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.83k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.83k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.83k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.83k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.83k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.83k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.83k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.83k
    constexpr bool READ_32_BITS =
203
4.83k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.83k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.83k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.83k
    word >>= FIRST_BIT_OFFSET;
213
214
4.83k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.83k
    return word & mask;
221
4.83k
}
_ZN5doris11UnpackValueILi9ELi7ELb1EEEmPKh
Line
Count
Source
176
4.83k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.83k
    if (BIT_WIDTH == 0) return 0;
178
179
4.83k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.83k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.83k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.83k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.83k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.83k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.83k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.83k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.83k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.83k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.83k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.83k
    constexpr bool READ_32_BITS =
203
4.83k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.83k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.83k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.83k
    word >>= FIRST_BIT_OFFSET;
213
214
4.83k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.83k
    return word & mask;
221
4.83k
}
_ZN5doris11UnpackValueILi9ELi8ELb1EEEmPKh
Line
Count
Source
176
4.83k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.83k
    if (BIT_WIDTH == 0) return 0;
178
179
4.83k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.83k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.83k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.83k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.83k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.83k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.83k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.83k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.83k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.83k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.83k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.83k
    constexpr bool READ_32_BITS =
203
4.83k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.83k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.83k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.83k
    word >>= FIRST_BIT_OFFSET;
213
214
4.83k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.83k
    return word & mask;
221
4.83k
}
_ZN5doris11UnpackValueILi9ELi9ELb1EEEmPKh
Line
Count
Source
176
4.83k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.83k
    if (BIT_WIDTH == 0) return 0;
178
179
4.83k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.83k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.83k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.83k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.83k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.83k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.83k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.83k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.83k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.83k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.83k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.83k
    constexpr bool READ_32_BITS =
203
4.83k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.83k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.83k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.83k
    word >>= FIRST_BIT_OFFSET;
213
214
4.83k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.83k
    return word & mask;
221
4.83k
}
_ZN5doris11UnpackValueILi9ELi10ELb1EEEmPKh
Line
Count
Source
176
4.83k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.83k
    if (BIT_WIDTH == 0) return 0;
178
179
4.83k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.83k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.83k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.83k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.83k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.83k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.83k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.83k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.83k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.83k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.83k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.83k
    constexpr bool READ_32_BITS =
203
4.83k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.83k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.83k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.83k
    word >>= FIRST_BIT_OFFSET;
213
214
4.83k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.83k
    return word & mask;
221
4.83k
}
_ZN5doris11UnpackValueILi9ELi11ELb1EEEmPKh
Line
Count
Source
176
4.83k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.83k
    if (BIT_WIDTH == 0) return 0;
178
179
4.83k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.83k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.83k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.83k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.83k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.83k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.83k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.83k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.83k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.83k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.83k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.83k
    constexpr bool READ_32_BITS =
203
4.83k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.83k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.83k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.83k
    word >>= FIRST_BIT_OFFSET;
213
214
4.83k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.83k
    return word & mask;
221
4.83k
}
_ZN5doris11UnpackValueILi9ELi12ELb1EEEmPKh
Line
Count
Source
176
4.83k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.83k
    if (BIT_WIDTH == 0) return 0;
178
179
4.83k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.83k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.83k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.83k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.83k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.83k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.83k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.83k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.83k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.83k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.83k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.83k
    constexpr bool READ_32_BITS =
203
4.83k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.83k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.83k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.83k
    word >>= FIRST_BIT_OFFSET;
213
214
4.83k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.83k
    return word & mask;
221
4.83k
}
_ZN5doris11UnpackValueILi9ELi13ELb1EEEmPKh
Line
Count
Source
176
4.83k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.83k
    if (BIT_WIDTH == 0) return 0;
178
179
4.83k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.83k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.83k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.83k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.83k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.83k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.83k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.83k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.83k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.83k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.83k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.83k
    constexpr bool READ_32_BITS =
203
4.83k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.83k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.83k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.83k
    word >>= FIRST_BIT_OFFSET;
213
214
4.83k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.83k
    return word & mask;
221
4.83k
}
_ZN5doris11UnpackValueILi9ELi14ELb1EEEmPKh
Line
Count
Source
176
4.83k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.83k
    if (BIT_WIDTH == 0) return 0;
178
179
4.83k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.83k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.83k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.83k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.83k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.83k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.83k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.83k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.83k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.83k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.83k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.83k
    constexpr bool READ_32_BITS =
203
4.83k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.83k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.83k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.83k
    word >>= FIRST_BIT_OFFSET;
213
214
4.83k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.83k
    return word & mask;
221
4.83k
}
_ZN5doris11UnpackValueILi9ELi15ELb1EEEmPKh
Line
Count
Source
176
4.83k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.83k
    if (BIT_WIDTH == 0) return 0;
178
179
4.83k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.83k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.83k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.83k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.83k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.83k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.83k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.83k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.83k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.83k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.83k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.83k
    constexpr bool READ_32_BITS =
203
4.83k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.83k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.83k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.83k
    word >>= FIRST_BIT_OFFSET;
213
214
4.83k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.83k
    return word & mask;
221
4.83k
}
_ZN5doris11UnpackValueILi9ELi16ELb1EEEmPKh
Line
Count
Source
176
4.83k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.83k
    if (BIT_WIDTH == 0) return 0;
178
179
4.83k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.83k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.83k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.83k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.83k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.83k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.83k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.83k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.83k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.83k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.83k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.83k
    constexpr bool READ_32_BITS =
203
4.83k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.83k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.83k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.83k
    word >>= FIRST_BIT_OFFSET;
213
214
4.83k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.83k
    return word & mask;
221
4.83k
}
_ZN5doris11UnpackValueILi9ELi17ELb1EEEmPKh
Line
Count
Source
176
4.83k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.83k
    if (BIT_WIDTH == 0) return 0;
178
179
4.83k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.83k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.83k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.83k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.83k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.83k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.83k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.83k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.83k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.83k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.83k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.83k
    constexpr bool READ_32_BITS =
203
4.83k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.83k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.83k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.83k
    word >>= FIRST_BIT_OFFSET;
213
214
4.83k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.83k
    return word & mask;
221
4.83k
}
_ZN5doris11UnpackValueILi9ELi18ELb1EEEmPKh
Line
Count
Source
176
4.83k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.83k
    if (BIT_WIDTH == 0) return 0;
178
179
4.83k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.83k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.83k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.83k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.83k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.83k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.83k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.83k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.83k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.83k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.83k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.83k
    constexpr bool READ_32_BITS =
203
4.83k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.83k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.83k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.83k
    word >>= FIRST_BIT_OFFSET;
213
214
4.83k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.83k
    return word & mask;
221
4.83k
}
_ZN5doris11UnpackValueILi9ELi19ELb1EEEmPKh
Line
Count
Source
176
4.83k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.83k
    if (BIT_WIDTH == 0) return 0;
178
179
4.83k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.83k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.83k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.83k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.83k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.83k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.83k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.83k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.83k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.83k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.83k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.83k
    constexpr bool READ_32_BITS =
203
4.83k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.83k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.83k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.83k
    word >>= FIRST_BIT_OFFSET;
213
214
4.83k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.83k
    return word & mask;
221
4.83k
}
_ZN5doris11UnpackValueILi9ELi20ELb1EEEmPKh
Line
Count
Source
176
4.83k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.83k
    if (BIT_WIDTH == 0) return 0;
178
179
4.83k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.83k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.83k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.83k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.83k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.83k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.83k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.83k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.83k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.83k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.83k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.83k
    constexpr bool READ_32_BITS =
203
4.83k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.83k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.83k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.83k
    word >>= FIRST_BIT_OFFSET;
213
214
4.83k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.83k
    return word & mask;
221
4.83k
}
_ZN5doris11UnpackValueILi9ELi21ELb1EEEmPKh
Line
Count
Source
176
4.83k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.83k
    if (BIT_WIDTH == 0) return 0;
178
179
4.83k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.83k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.83k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.83k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.83k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.83k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.83k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.83k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.83k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.83k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.83k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.83k
    constexpr bool READ_32_BITS =
203
4.83k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.83k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.83k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.83k
    word >>= FIRST_BIT_OFFSET;
213
214
4.83k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.83k
    return word & mask;
221
4.83k
}
_ZN5doris11UnpackValueILi9ELi22ELb1EEEmPKh
Line
Count
Source
176
4.83k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.83k
    if (BIT_WIDTH == 0) return 0;
178
179
4.83k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.83k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.83k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.83k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.83k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.83k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.83k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.83k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.83k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.83k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.83k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.83k
    constexpr bool READ_32_BITS =
203
4.83k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.83k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.83k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.83k
    word >>= FIRST_BIT_OFFSET;
213
214
4.83k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.83k
    return word & mask;
221
4.83k
}
_ZN5doris11UnpackValueILi9ELi23ELb1EEEmPKh
Line
Count
Source
176
4.83k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.83k
    if (BIT_WIDTH == 0) return 0;
178
179
4.83k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.83k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.83k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.83k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.83k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.83k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.83k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.83k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.83k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.83k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.83k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.83k
    constexpr bool READ_32_BITS =
203
4.83k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.83k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.83k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.83k
    word >>= FIRST_BIT_OFFSET;
213
214
4.83k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.83k
    return word & mask;
221
4.83k
}
_ZN5doris11UnpackValueILi9ELi24ELb1EEEmPKh
Line
Count
Source
176
4.83k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.83k
    if (BIT_WIDTH == 0) return 0;
178
179
4.83k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.83k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.83k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.83k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.83k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.83k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.83k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.83k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.83k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.83k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.83k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.83k
    constexpr bool READ_32_BITS =
203
4.83k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.83k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.83k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.83k
    word >>= FIRST_BIT_OFFSET;
213
214
4.83k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.83k
    return word & mask;
221
4.83k
}
_ZN5doris11UnpackValueILi9ELi25ELb1EEEmPKh
Line
Count
Source
176
4.83k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.83k
    if (BIT_WIDTH == 0) return 0;
178
179
4.83k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.83k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.83k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.83k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.83k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.83k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.83k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.83k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.83k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.83k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.83k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.83k
    constexpr bool READ_32_BITS =
203
4.83k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.83k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.83k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.83k
    word >>= FIRST_BIT_OFFSET;
213
214
4.83k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.83k
    return word & mask;
221
4.83k
}
_ZN5doris11UnpackValueILi9ELi26ELb1EEEmPKh
Line
Count
Source
176
4.83k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.83k
    if (BIT_WIDTH == 0) return 0;
178
179
4.83k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.83k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.83k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.83k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.83k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.83k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.83k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.83k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.83k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.83k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.83k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.83k
    constexpr bool READ_32_BITS =
203
4.83k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.83k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.83k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.83k
    word >>= FIRST_BIT_OFFSET;
213
214
4.83k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.83k
    return word & mask;
221
4.83k
}
_ZN5doris11UnpackValueILi9ELi27ELb1EEEmPKh
Line
Count
Source
176
4.83k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.83k
    if (BIT_WIDTH == 0) return 0;
178
179
4.83k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.83k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.83k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.83k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.83k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.83k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.83k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.83k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.83k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.83k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.83k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.83k
    constexpr bool READ_32_BITS =
203
4.83k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.83k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.83k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.83k
    word >>= FIRST_BIT_OFFSET;
213
214
4.83k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.83k
    return word & mask;
221
4.83k
}
_ZN5doris11UnpackValueILi9ELi28ELb1EEEmPKh
Line
Count
Source
176
4.83k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.83k
    if (BIT_WIDTH == 0) return 0;
178
179
4.83k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.83k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.83k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.83k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.83k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.83k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.83k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.83k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.83k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.83k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.83k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.83k
    constexpr bool READ_32_BITS =
203
4.83k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.83k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.83k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.83k
    word >>= FIRST_BIT_OFFSET;
213
214
4.83k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.83k
    return word & mask;
221
4.83k
}
_ZN5doris11UnpackValueILi9ELi29ELb1EEEmPKh
Line
Count
Source
176
4.83k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.83k
    if (BIT_WIDTH == 0) return 0;
178
179
4.83k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.83k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.83k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.83k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.83k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.83k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.83k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.83k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.83k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.83k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.83k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.83k
    constexpr bool READ_32_BITS =
203
4.83k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.83k
    if (READ_32_BITS) {
206
4.83k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.83k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.83k
        return word & mask;
209
4.83k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.83k
}
_ZN5doris11UnpackValueILi9ELi30ELb1EEEmPKh
Line
Count
Source
176
4.83k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.83k
    if (BIT_WIDTH == 0) return 0;
178
179
4.83k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.83k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.83k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.83k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.83k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.83k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.83k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.83k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.83k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.83k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.83k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.83k
    constexpr bool READ_32_BITS =
203
4.83k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.83k
    if (READ_32_BITS) {
206
4.83k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.83k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.83k
        return word & mask;
209
4.83k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.83k
}
_ZN5doris11UnpackValueILi9ELi31ELb1EEEmPKh
Line
Count
Source
176
4.83k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.83k
    if (BIT_WIDTH == 0) return 0;
178
179
4.83k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.83k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.83k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.83k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.83k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.83k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.83k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.83k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.83k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.83k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.83k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.83k
    constexpr bool READ_32_BITS =
203
4.83k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.83k
    if (READ_32_BITS) {
206
4.83k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.83k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.83k
        return word & mask;
209
4.83k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.83k
}
Unexecuted instantiation: _ZN5doris11UnpackValueILi9ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi9ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi9ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi9ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi9ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi9ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi9ELi24ELb0EEEmPKh
_ZN5doris11UnpackValueILi9ELi23ELb0EEEmPKh
Line
Count
Source
176
314
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
314
    if (BIT_WIDTH == 0) return 0;
178
179
314
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
314
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
314
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
314
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
314
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
314
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
314
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
314
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
314
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
314
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
314
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
314
    constexpr bool READ_32_BITS =
203
314
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
314
    if (READ_32_BITS) {
206
314
        uint32_t word = in[FIRST_WORD_IDX];
207
314
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
314
        return word & mask;
209
314
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
314
}
_ZN5doris11UnpackValueILi9ELi22ELb0EEEmPKh
Line
Count
Source
176
314
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
314
    if (BIT_WIDTH == 0) return 0;
178
179
314
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
314
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
314
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
314
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
314
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
314
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
314
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
314
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
314
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
314
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
314
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
314
    constexpr bool READ_32_BITS =
203
314
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
314
    if (READ_32_BITS) {
206
314
        uint32_t word = in[FIRST_WORD_IDX];
207
314
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
314
        return word & mask;
209
314
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
314
}
_ZN5doris11UnpackValueILi9ELi21ELb0EEEmPKh
Line
Count
Source
176
314
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
314
    if (BIT_WIDTH == 0) return 0;
178
179
314
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
314
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
314
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
314
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
314
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
314
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
314
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
314
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
314
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
314
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
314
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
314
    constexpr bool READ_32_BITS =
203
314
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
314
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
314
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
314
    word >>= FIRST_BIT_OFFSET;
213
214
314
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
314
    return word & mask;
221
314
}
_ZN5doris11UnpackValueILi9ELi20ELb0EEEmPKh
Line
Count
Source
176
314
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
314
    if (BIT_WIDTH == 0) return 0;
178
179
314
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
314
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
314
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
314
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
314
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
314
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
314
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
314
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
314
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
314
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
314
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
314
    constexpr bool READ_32_BITS =
203
314
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
314
    if (READ_32_BITS) {
206
314
        uint32_t word = in[FIRST_WORD_IDX];
207
314
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
314
        return word & mask;
209
314
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
314
}
_ZN5doris11UnpackValueILi9ELi19ELb0EEEmPKh
Line
Count
Source
176
314
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
314
    if (BIT_WIDTH == 0) return 0;
178
179
314
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
314
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
314
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
314
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
314
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
314
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
314
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
314
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
314
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
314
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
314
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
314
    constexpr bool READ_32_BITS =
203
314
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
314
    if (READ_32_BITS) {
206
314
        uint32_t word = in[FIRST_WORD_IDX];
207
314
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
314
        return word & mask;
209
314
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
314
}
_ZN5doris11UnpackValueILi9ELi18ELb0EEEmPKh
Line
Count
Source
176
314
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
314
    if (BIT_WIDTH == 0) return 0;
178
179
314
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
314
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
314
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
314
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
314
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
314
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
314
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
314
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
314
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
314
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
314
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
314
    constexpr bool READ_32_BITS =
203
314
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
314
    if (READ_32_BITS) {
206
314
        uint32_t word = in[FIRST_WORD_IDX];
207
314
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
314
        return word & mask;
209
314
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
314
}
_ZN5doris11UnpackValueILi9ELi17ELb0EEEmPKh
Line
Count
Source
176
314
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
314
    if (BIT_WIDTH == 0) return 0;
178
179
314
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
314
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
314
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
314
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
314
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
314
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
314
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
314
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
314
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
314
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
314
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
314
    constexpr bool READ_32_BITS =
203
314
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
314
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
314
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
314
    word >>= FIRST_BIT_OFFSET;
213
214
314
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
314
    return word & mask;
221
314
}
_ZN5doris11UnpackValueILi9ELi16ELb0EEEmPKh
Line
Count
Source
176
314
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
314
    if (BIT_WIDTH == 0) return 0;
178
179
314
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
314
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
314
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
314
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
314
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
314
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
314
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
314
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
314
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
314
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
314
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
314
    constexpr bool READ_32_BITS =
203
314
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
314
    if (READ_32_BITS) {
206
314
        uint32_t word = in[FIRST_WORD_IDX];
207
314
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
314
        return word & mask;
209
314
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
314
}
_ZN5doris11UnpackValueILi9ELi15ELb0EEEmPKh
Line
Count
Source
176
314
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
314
    if (BIT_WIDTH == 0) return 0;
178
179
314
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
314
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
314
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
314
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
314
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
314
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
314
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
314
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
314
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
314
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
314
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
314
    constexpr bool READ_32_BITS =
203
314
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
314
    if (READ_32_BITS) {
206
314
        uint32_t word = in[FIRST_WORD_IDX];
207
314
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
314
        return word & mask;
209
314
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
314
}
_ZN5doris11UnpackValueILi9ELi14ELb0EEEmPKh
Line
Count
Source
176
314
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
314
    if (BIT_WIDTH == 0) return 0;
178
179
314
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
314
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
314
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
314
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
314
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
314
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
314
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
314
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
314
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
314
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
314
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
314
    constexpr bool READ_32_BITS =
203
314
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
314
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
314
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
314
    word >>= FIRST_BIT_OFFSET;
213
214
314
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
314
    return word & mask;
221
314
}
_ZN5doris11UnpackValueILi9ELi13ELb0EEEmPKh
Line
Count
Source
176
314
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
314
    if (BIT_WIDTH == 0) return 0;
178
179
314
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
314
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
314
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
314
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
314
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
314
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
314
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
314
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
314
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
314
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
314
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
314
    constexpr bool READ_32_BITS =
203
314
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
314
    if (READ_32_BITS) {
206
314
        uint32_t word = in[FIRST_WORD_IDX];
207
314
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
314
        return word & mask;
209
314
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
314
}
_ZN5doris11UnpackValueILi9ELi12ELb0EEEmPKh
Line
Count
Source
176
314
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
314
    if (BIT_WIDTH == 0) return 0;
178
179
314
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
314
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
314
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
314
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
314
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
314
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
314
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
314
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
314
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
314
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
314
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
314
    constexpr bool READ_32_BITS =
203
314
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
314
    if (READ_32_BITS) {
206
314
        uint32_t word = in[FIRST_WORD_IDX];
207
314
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
314
        return word & mask;
209
314
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
314
}
_ZN5doris11UnpackValueILi9ELi11ELb0EEEmPKh
Line
Count
Source
176
314
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
314
    if (BIT_WIDTH == 0) return 0;
178
179
314
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
314
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
314
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
314
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
314
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
314
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
314
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
314
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
314
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
314
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
314
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
314
    constexpr bool READ_32_BITS =
203
314
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
314
    if (READ_32_BITS) {
206
314
        uint32_t word = in[FIRST_WORD_IDX];
207
314
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
314
        return word & mask;
209
314
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
314
}
_ZN5doris11UnpackValueILi9ELi10ELb0EEEmPKh
Line
Count
Source
176
314
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
314
    if (BIT_WIDTH == 0) return 0;
178
179
314
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
314
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
314
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
314
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
314
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
314
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
314
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
314
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
314
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
314
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
314
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
314
    constexpr bool READ_32_BITS =
203
314
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
314
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
314
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
314
    word >>= FIRST_BIT_OFFSET;
213
214
314
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
314
    return word & mask;
221
314
}
_ZN5doris11UnpackValueILi9ELi9ELb0EEEmPKh
Line
Count
Source
176
314
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
314
    if (BIT_WIDTH == 0) return 0;
178
179
314
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
314
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
314
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
314
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
314
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
314
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
314
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
314
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
314
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
314
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
314
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
314
    constexpr bool READ_32_BITS =
203
314
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
314
    if (READ_32_BITS) {
206
314
        uint32_t word = in[FIRST_WORD_IDX];
207
314
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
314
        return word & mask;
209
314
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
314
}
_ZN5doris11UnpackValueILi9ELi8ELb0EEEmPKh
Line
Count
Source
176
314
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
314
    if (BIT_WIDTH == 0) return 0;
178
179
314
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
314
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
314
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
314
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
314
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
314
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
314
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
314
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
314
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
314
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
314
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
314
    constexpr bool READ_32_BITS =
203
314
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
314
    if (READ_32_BITS) {
206
314
        uint32_t word = in[FIRST_WORD_IDX];
207
314
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
314
        return word & mask;
209
314
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
314
}
_ZN5doris11UnpackValueILi9ELi7ELb0EEEmPKh
Line
Count
Source
176
314
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
314
    if (BIT_WIDTH == 0) return 0;
178
179
314
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
314
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
314
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
314
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
314
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
314
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
314
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
314
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
314
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
314
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
314
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
314
    constexpr bool READ_32_BITS =
203
314
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
314
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
314
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
314
    word >>= FIRST_BIT_OFFSET;
213
214
314
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
314
    return word & mask;
221
314
}
_ZN5doris11UnpackValueILi9ELi6ELb0EEEmPKh
Line
Count
Source
176
314
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
314
    if (BIT_WIDTH == 0) return 0;
178
179
314
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
314
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
314
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
314
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
314
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
314
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
314
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
314
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
314
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
314
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
314
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
314
    constexpr bool READ_32_BITS =
203
314
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
314
    if (READ_32_BITS) {
206
314
        uint32_t word = in[FIRST_WORD_IDX];
207
314
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
314
        return word & mask;
209
314
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
314
}
_ZN5doris11UnpackValueILi9ELi5ELb0EEEmPKh
Line
Count
Source
176
314
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
314
    if (BIT_WIDTH == 0) return 0;
178
179
314
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
314
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
314
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
314
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
314
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
314
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
314
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
314
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
314
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
314
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
314
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
314
    constexpr bool READ_32_BITS =
203
314
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
314
    if (READ_32_BITS) {
206
314
        uint32_t word = in[FIRST_WORD_IDX];
207
314
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
314
        return word & mask;
209
314
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
314
}
_ZN5doris11UnpackValueILi9ELi4ELb0EEEmPKh
Line
Count
Source
176
314
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
314
    if (BIT_WIDTH == 0) return 0;
178
179
314
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
314
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
314
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
314
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
314
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
314
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
314
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
314
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
314
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
314
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
314
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
314
    constexpr bool READ_32_BITS =
203
314
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
314
    if (READ_32_BITS) {
206
314
        uint32_t word = in[FIRST_WORD_IDX];
207
314
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
314
        return word & mask;
209
314
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
314
}
_ZN5doris11UnpackValueILi9ELi3ELb0EEEmPKh
Line
Count
Source
176
314
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
314
    if (BIT_WIDTH == 0) return 0;
178
179
314
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
314
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
314
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
314
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
314
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
314
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
314
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
314
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
314
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
314
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
314
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
314
    constexpr bool READ_32_BITS =
203
314
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
314
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
314
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
314
    word >>= FIRST_BIT_OFFSET;
213
214
314
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
314
    return word & mask;
221
314
}
_ZN5doris11UnpackValueILi9ELi2ELb0EEEmPKh
Line
Count
Source
176
314
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
314
    if (BIT_WIDTH == 0) return 0;
178
179
314
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
314
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
314
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
314
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
314
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
314
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
314
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
314
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
314
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
314
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
314
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
314
    constexpr bool READ_32_BITS =
203
314
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
314
    if (READ_32_BITS) {
206
314
        uint32_t word = in[FIRST_WORD_IDX];
207
314
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
314
        return word & mask;
209
314
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
314
}
_ZN5doris11UnpackValueILi9ELi1ELb0EEEmPKh
Line
Count
Source
176
314
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
314
    if (BIT_WIDTH == 0) return 0;
178
179
314
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
314
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
314
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
314
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
314
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
314
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
314
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
314
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
314
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
314
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
314
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
314
    constexpr bool READ_32_BITS =
203
314
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
314
    if (READ_32_BITS) {
206
314
        uint32_t word = in[FIRST_WORD_IDX];
207
314
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
314
        return word & mask;
209
314
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
314
}
_ZN5doris11UnpackValueILi9ELi0ELb0EEEmPKh
Line
Count
Source
176
314
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
314
    if (BIT_WIDTH == 0) return 0;
178
179
314
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
314
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
314
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
314
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
314
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
314
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
314
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
314
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
314
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
314
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
314
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
314
    constexpr bool READ_32_BITS =
203
314
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
314
    if (READ_32_BITS) {
206
314
        uint32_t word = in[FIRST_WORD_IDX];
207
314
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
314
        return word & mask;
209
314
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
314
}
_ZN5doris11UnpackValueILi10ELi0ELb1EEEmPKh
Line
Count
Source
176
13.2k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
13.2k
    if (BIT_WIDTH == 0) return 0;
178
179
13.2k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
13.2k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
13.2k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
13.2k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
13.2k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
13.2k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
13.2k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
13.2k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
13.2k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
13.2k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
13.2k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
13.2k
    constexpr bool READ_32_BITS =
203
13.2k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
13.2k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
13.2k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
13.2k
    word >>= FIRST_BIT_OFFSET;
213
214
13.2k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
13.2k
    return word & mask;
221
13.2k
}
_ZN5doris11UnpackValueILi10ELi1ELb1EEEmPKh
Line
Count
Source
176
13.2k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
13.2k
    if (BIT_WIDTH == 0) return 0;
178
179
13.2k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
13.2k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
13.2k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
13.2k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
13.2k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
13.2k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
13.2k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
13.2k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
13.2k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
13.2k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
13.2k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
13.2k
    constexpr bool READ_32_BITS =
203
13.2k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
13.2k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
13.2k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
13.2k
    word >>= FIRST_BIT_OFFSET;
213
214
13.2k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
13.2k
    return word & mask;
221
13.2k
}
_ZN5doris11UnpackValueILi10ELi2ELb1EEEmPKh
Line
Count
Source
176
13.2k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
13.2k
    if (BIT_WIDTH == 0) return 0;
178
179
13.2k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
13.2k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
13.2k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
13.2k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
13.2k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
13.2k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
13.2k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
13.2k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
13.2k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
13.2k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
13.2k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
13.2k
    constexpr bool READ_32_BITS =
203
13.2k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
13.2k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
13.2k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
13.2k
    word >>= FIRST_BIT_OFFSET;
213
214
13.2k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
13.2k
    return word & mask;
221
13.2k
}
_ZN5doris11UnpackValueILi10ELi3ELb1EEEmPKh
Line
Count
Source
176
13.2k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
13.2k
    if (BIT_WIDTH == 0) return 0;
178
179
13.2k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
13.2k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
13.2k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
13.2k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
13.2k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
13.2k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
13.2k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
13.2k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
13.2k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
13.2k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
13.2k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
13.2k
    constexpr bool READ_32_BITS =
203
13.2k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
13.2k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
13.2k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
13.2k
    word >>= FIRST_BIT_OFFSET;
213
214
13.2k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
13.2k
    return word & mask;
221
13.2k
}
_ZN5doris11UnpackValueILi10ELi4ELb1EEEmPKh
Line
Count
Source
176
13.2k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
13.2k
    if (BIT_WIDTH == 0) return 0;
178
179
13.2k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
13.2k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
13.2k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
13.2k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
13.2k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
13.2k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
13.2k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
13.2k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
13.2k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
13.2k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
13.2k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
13.2k
    constexpr bool READ_32_BITS =
203
13.2k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
13.2k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
13.2k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
13.2k
    word >>= FIRST_BIT_OFFSET;
213
214
13.2k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
13.2k
    return word & mask;
221
13.2k
}
_ZN5doris11UnpackValueILi10ELi5ELb1EEEmPKh
Line
Count
Source
176
13.2k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
13.2k
    if (BIT_WIDTH == 0) return 0;
178
179
13.2k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
13.2k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
13.2k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
13.2k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
13.2k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
13.2k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
13.2k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
13.2k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
13.2k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
13.2k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
13.2k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
13.2k
    constexpr bool READ_32_BITS =
203
13.2k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
13.2k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
13.2k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
13.2k
    word >>= FIRST_BIT_OFFSET;
213
214
13.2k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
13.2k
    return word & mask;
221
13.2k
}
_ZN5doris11UnpackValueILi10ELi6ELb1EEEmPKh
Line
Count
Source
176
13.2k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
13.2k
    if (BIT_WIDTH == 0) return 0;
178
179
13.2k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
13.2k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
13.2k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
13.2k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
13.2k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
13.2k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
13.2k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
13.2k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
13.2k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
13.2k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
13.2k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
13.2k
    constexpr bool READ_32_BITS =
203
13.2k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
13.2k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
13.2k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
13.2k
    word >>= FIRST_BIT_OFFSET;
213
214
13.2k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
13.2k
    return word & mask;
221
13.2k
}
_ZN5doris11UnpackValueILi10ELi7ELb1EEEmPKh
Line
Count
Source
176
13.2k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
13.2k
    if (BIT_WIDTH == 0) return 0;
178
179
13.2k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
13.2k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
13.2k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
13.2k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
13.2k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
13.2k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
13.2k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
13.2k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
13.2k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
13.2k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
13.2k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
13.2k
    constexpr bool READ_32_BITS =
203
13.2k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
13.2k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
13.2k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
13.2k
    word >>= FIRST_BIT_OFFSET;
213
214
13.2k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
13.2k
    return word & mask;
221
13.2k
}
_ZN5doris11UnpackValueILi10ELi8ELb1EEEmPKh
Line
Count
Source
176
13.2k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
13.2k
    if (BIT_WIDTH == 0) return 0;
178
179
13.2k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
13.2k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
13.2k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
13.2k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
13.2k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
13.2k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
13.2k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
13.2k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
13.2k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
13.2k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
13.2k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
13.2k
    constexpr bool READ_32_BITS =
203
13.2k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
13.2k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
13.2k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
13.2k
    word >>= FIRST_BIT_OFFSET;
213
214
13.2k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
13.2k
    return word & mask;
221
13.2k
}
_ZN5doris11UnpackValueILi10ELi9ELb1EEEmPKh
Line
Count
Source
176
13.2k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
13.2k
    if (BIT_WIDTH == 0) return 0;
178
179
13.2k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
13.2k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
13.2k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
13.2k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
13.2k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
13.2k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
13.2k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
13.2k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
13.2k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
13.2k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
13.2k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
13.2k
    constexpr bool READ_32_BITS =
203
13.2k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
13.2k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
13.2k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
13.2k
    word >>= FIRST_BIT_OFFSET;
213
214
13.2k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
13.2k
    return word & mask;
221
13.2k
}
_ZN5doris11UnpackValueILi10ELi10ELb1EEEmPKh
Line
Count
Source
176
13.2k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
13.2k
    if (BIT_WIDTH == 0) return 0;
178
179
13.2k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
13.2k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
13.2k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
13.2k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
13.2k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
13.2k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
13.2k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
13.2k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
13.2k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
13.2k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
13.2k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
13.2k
    constexpr bool READ_32_BITS =
203
13.2k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
13.2k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
13.2k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
13.2k
    word >>= FIRST_BIT_OFFSET;
213
214
13.2k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
13.2k
    return word & mask;
221
13.2k
}
_ZN5doris11UnpackValueILi10ELi11ELb1EEEmPKh
Line
Count
Source
176
13.2k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
13.2k
    if (BIT_WIDTH == 0) return 0;
178
179
13.2k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
13.2k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
13.2k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
13.2k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
13.2k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
13.2k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
13.2k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
13.2k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
13.2k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
13.2k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
13.2k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
13.2k
    constexpr bool READ_32_BITS =
203
13.2k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
13.2k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
13.2k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
13.2k
    word >>= FIRST_BIT_OFFSET;
213
214
13.2k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
13.2k
    return word & mask;
221
13.2k
}
_ZN5doris11UnpackValueILi10ELi12ELb1EEEmPKh
Line
Count
Source
176
13.2k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
13.2k
    if (BIT_WIDTH == 0) return 0;
178
179
13.2k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
13.2k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
13.2k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
13.2k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
13.2k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
13.2k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
13.2k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
13.2k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
13.2k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
13.2k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
13.2k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
13.2k
    constexpr bool READ_32_BITS =
203
13.2k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
13.2k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
13.2k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
13.2k
    word >>= FIRST_BIT_OFFSET;
213
214
13.2k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
13.2k
    return word & mask;
221
13.2k
}
_ZN5doris11UnpackValueILi10ELi13ELb1EEEmPKh
Line
Count
Source
176
13.2k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
13.2k
    if (BIT_WIDTH == 0) return 0;
178
179
13.2k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
13.2k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
13.2k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
13.2k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
13.2k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
13.2k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
13.2k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
13.2k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
13.2k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
13.2k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
13.2k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
13.2k
    constexpr bool READ_32_BITS =
203
13.2k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
13.2k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
13.2k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
13.2k
    word >>= FIRST_BIT_OFFSET;
213
214
13.2k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
13.2k
    return word & mask;
221
13.2k
}
_ZN5doris11UnpackValueILi10ELi14ELb1EEEmPKh
Line
Count
Source
176
13.2k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
13.2k
    if (BIT_WIDTH == 0) return 0;
178
179
13.2k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
13.2k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
13.2k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
13.2k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
13.2k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
13.2k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
13.2k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
13.2k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
13.2k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
13.2k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
13.2k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
13.2k
    constexpr bool READ_32_BITS =
203
13.2k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
13.2k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
13.2k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
13.2k
    word >>= FIRST_BIT_OFFSET;
213
214
13.2k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
13.2k
    return word & mask;
221
13.2k
}
_ZN5doris11UnpackValueILi10ELi15ELb1EEEmPKh
Line
Count
Source
176
13.2k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
13.2k
    if (BIT_WIDTH == 0) return 0;
178
179
13.2k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
13.2k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
13.2k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
13.2k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
13.2k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
13.2k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
13.2k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
13.2k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
13.2k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
13.2k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
13.2k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
13.2k
    constexpr bool READ_32_BITS =
203
13.2k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
13.2k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
13.2k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
13.2k
    word >>= FIRST_BIT_OFFSET;
213
214
13.2k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
13.2k
    return word & mask;
221
13.2k
}
_ZN5doris11UnpackValueILi10ELi16ELb1EEEmPKh
Line
Count
Source
176
13.2k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
13.2k
    if (BIT_WIDTH == 0) return 0;
178
179
13.2k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
13.2k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
13.2k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
13.2k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
13.2k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
13.2k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
13.2k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
13.2k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
13.2k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
13.2k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
13.2k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
13.2k
    constexpr bool READ_32_BITS =
203
13.2k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
13.2k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
13.2k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
13.2k
    word >>= FIRST_BIT_OFFSET;
213
214
13.2k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
13.2k
    return word & mask;
221
13.2k
}
_ZN5doris11UnpackValueILi10ELi17ELb1EEEmPKh
Line
Count
Source
176
13.2k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
13.2k
    if (BIT_WIDTH == 0) return 0;
178
179
13.2k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
13.2k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
13.2k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
13.2k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
13.2k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
13.2k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
13.2k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
13.2k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
13.2k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
13.2k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
13.2k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
13.2k
    constexpr bool READ_32_BITS =
203
13.2k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
13.2k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
13.2k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
13.2k
    word >>= FIRST_BIT_OFFSET;
213
214
13.2k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
13.2k
    return word & mask;
221
13.2k
}
_ZN5doris11UnpackValueILi10ELi18ELb1EEEmPKh
Line
Count
Source
176
13.2k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
13.2k
    if (BIT_WIDTH == 0) return 0;
178
179
13.2k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
13.2k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
13.2k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
13.2k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
13.2k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
13.2k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
13.2k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
13.2k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
13.2k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
13.2k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
13.2k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
13.2k
    constexpr bool READ_32_BITS =
203
13.2k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
13.2k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
13.2k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
13.2k
    word >>= FIRST_BIT_OFFSET;
213
214
13.2k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
13.2k
    return word & mask;
221
13.2k
}
_ZN5doris11UnpackValueILi10ELi19ELb1EEEmPKh
Line
Count
Source
176
13.2k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
13.2k
    if (BIT_WIDTH == 0) return 0;
178
179
13.2k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
13.2k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
13.2k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
13.2k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
13.2k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
13.2k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
13.2k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
13.2k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
13.2k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
13.2k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
13.2k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
13.2k
    constexpr bool READ_32_BITS =
203
13.2k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
13.2k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
13.2k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
13.2k
    word >>= FIRST_BIT_OFFSET;
213
214
13.2k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
13.2k
    return word & mask;
221
13.2k
}
_ZN5doris11UnpackValueILi10ELi20ELb1EEEmPKh
Line
Count
Source
176
13.2k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
13.2k
    if (BIT_WIDTH == 0) return 0;
178
179
13.2k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
13.2k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
13.2k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
13.2k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
13.2k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
13.2k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
13.2k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
13.2k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
13.2k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
13.2k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
13.2k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
13.2k
    constexpr bool READ_32_BITS =
203
13.2k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
13.2k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
13.2k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
13.2k
    word >>= FIRST_BIT_OFFSET;
213
214
13.2k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
13.2k
    return word & mask;
221
13.2k
}
_ZN5doris11UnpackValueILi10ELi21ELb1EEEmPKh
Line
Count
Source
176
13.2k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
13.2k
    if (BIT_WIDTH == 0) return 0;
178
179
13.2k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
13.2k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
13.2k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
13.2k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
13.2k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
13.2k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
13.2k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
13.2k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
13.2k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
13.2k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
13.2k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
13.2k
    constexpr bool READ_32_BITS =
203
13.2k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
13.2k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
13.2k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
13.2k
    word >>= FIRST_BIT_OFFSET;
213
214
13.2k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
13.2k
    return word & mask;
221
13.2k
}
_ZN5doris11UnpackValueILi10ELi22ELb1EEEmPKh
Line
Count
Source
176
13.2k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
13.2k
    if (BIT_WIDTH == 0) return 0;
178
179
13.2k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
13.2k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
13.2k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
13.2k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
13.2k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
13.2k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
13.2k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
13.2k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
13.2k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
13.2k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
13.2k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
13.2k
    constexpr bool READ_32_BITS =
203
13.2k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
13.2k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
13.2k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
13.2k
    word >>= FIRST_BIT_OFFSET;
213
214
13.2k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
13.2k
    return word & mask;
221
13.2k
}
_ZN5doris11UnpackValueILi10ELi23ELb1EEEmPKh
Line
Count
Source
176
13.2k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
13.2k
    if (BIT_WIDTH == 0) return 0;
178
179
13.2k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
13.2k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
13.2k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
13.2k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
13.2k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
13.2k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
13.2k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
13.2k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
13.2k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
13.2k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
13.2k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
13.2k
    constexpr bool READ_32_BITS =
203
13.2k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
13.2k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
13.2k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
13.2k
    word >>= FIRST_BIT_OFFSET;
213
214
13.2k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
13.2k
    return word & mask;
221
13.2k
}
_ZN5doris11UnpackValueILi10ELi24ELb1EEEmPKh
Line
Count
Source
176
13.2k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
13.2k
    if (BIT_WIDTH == 0) return 0;
178
179
13.2k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
13.2k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
13.2k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
13.2k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
13.2k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
13.2k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
13.2k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
13.2k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
13.2k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
13.2k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
13.2k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
13.2k
    constexpr bool READ_32_BITS =
203
13.2k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
13.2k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
13.2k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
13.2k
    word >>= FIRST_BIT_OFFSET;
213
214
13.2k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
13.2k
    return word & mask;
221
13.2k
}
_ZN5doris11UnpackValueILi10ELi25ELb1EEEmPKh
Line
Count
Source
176
13.2k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
13.2k
    if (BIT_WIDTH == 0) return 0;
178
179
13.2k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
13.2k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
13.2k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
13.2k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
13.2k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
13.2k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
13.2k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
13.2k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
13.2k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
13.2k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
13.2k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
13.2k
    constexpr bool READ_32_BITS =
203
13.2k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
13.2k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
13.2k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
13.2k
    word >>= FIRST_BIT_OFFSET;
213
214
13.2k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
13.2k
    return word & mask;
221
13.2k
}
_ZN5doris11UnpackValueILi10ELi26ELb1EEEmPKh
Line
Count
Source
176
13.2k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
13.2k
    if (BIT_WIDTH == 0) return 0;
178
179
13.2k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
13.2k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
13.2k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
13.2k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
13.2k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
13.2k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
13.2k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
13.2k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
13.2k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
13.2k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
13.2k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
13.2k
    constexpr bool READ_32_BITS =
203
13.2k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
13.2k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
13.2k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
13.2k
    word >>= FIRST_BIT_OFFSET;
213
214
13.2k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
13.2k
    return word & mask;
221
13.2k
}
_ZN5doris11UnpackValueILi10ELi27ELb1EEEmPKh
Line
Count
Source
176
13.2k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
13.2k
    if (BIT_WIDTH == 0) return 0;
178
179
13.2k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
13.2k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
13.2k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
13.2k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
13.2k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
13.2k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
13.2k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
13.2k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
13.2k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
13.2k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
13.2k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
13.2k
    constexpr bool READ_32_BITS =
203
13.2k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
13.2k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
13.2k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
13.2k
    word >>= FIRST_BIT_OFFSET;
213
214
13.2k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
13.2k
    return word & mask;
221
13.2k
}
_ZN5doris11UnpackValueILi10ELi28ELb1EEEmPKh
Line
Count
Source
176
13.2k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
13.2k
    if (BIT_WIDTH == 0) return 0;
178
179
13.2k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
13.2k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
13.2k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
13.2k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
13.2k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
13.2k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
13.2k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
13.2k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
13.2k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
13.2k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
13.2k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
13.2k
    constexpr bool READ_32_BITS =
203
13.2k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
13.2k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
13.2k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
13.2k
    word >>= FIRST_BIT_OFFSET;
213
214
13.2k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
13.2k
    return word & mask;
221
13.2k
}
_ZN5doris11UnpackValueILi10ELi29ELb1EEEmPKh
Line
Count
Source
176
13.2k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
13.2k
    if (BIT_WIDTH == 0) return 0;
178
179
13.2k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
13.2k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
13.2k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
13.2k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
13.2k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
13.2k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
13.2k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
13.2k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
13.2k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
13.2k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
13.2k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
13.2k
    constexpr bool READ_32_BITS =
203
13.2k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
13.2k
    if (READ_32_BITS) {
206
13.2k
        uint32_t word = in[FIRST_WORD_IDX];
207
13.2k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
13.2k
        return word & mask;
209
13.2k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
13.2k
}
_ZN5doris11UnpackValueILi10ELi30ELb1EEEmPKh
Line
Count
Source
176
13.2k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
13.2k
    if (BIT_WIDTH == 0) return 0;
178
179
13.2k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
13.2k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
13.2k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
13.2k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
13.2k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
13.2k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
13.2k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
13.2k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
13.2k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
13.2k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
13.2k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
13.2k
    constexpr bool READ_32_BITS =
203
13.2k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
13.2k
    if (READ_32_BITS) {
206
13.2k
        uint32_t word = in[FIRST_WORD_IDX];
207
13.2k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
13.2k
        return word & mask;
209
13.2k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
13.2k
}
_ZN5doris11UnpackValueILi10ELi31ELb1EEEmPKh
Line
Count
Source
176
13.2k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
13.2k
    if (BIT_WIDTH == 0) return 0;
178
179
13.2k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
13.2k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
13.2k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
13.2k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
13.2k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
13.2k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
13.2k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
13.2k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
13.2k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
13.2k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
13.2k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
13.2k
    constexpr bool READ_32_BITS =
203
13.2k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
13.2k
    if (READ_32_BITS) {
206
13.2k
        uint32_t word = in[FIRST_WORD_IDX];
207
13.2k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
13.2k
        return word & mask;
209
13.2k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
13.2k
}
Unexecuted instantiation: _ZN5doris11UnpackValueILi10ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi10ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi10ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi10ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi10ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi10ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi10ELi24ELb0EEEmPKh
_ZN5doris11UnpackValueILi10ELi23ELb0EEEmPKh
Line
Count
Source
176
875
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
875
    if (BIT_WIDTH == 0) return 0;
178
179
875
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
875
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
875
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
875
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
875
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
875
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
875
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
875
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
875
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
875
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
875
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
875
    constexpr bool READ_32_BITS =
203
875
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
875
    if (READ_32_BITS) {
206
875
        uint32_t word = in[FIRST_WORD_IDX];
207
875
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
875
        return word & mask;
209
875
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
875
}
_ZN5doris11UnpackValueILi10ELi22ELb0EEEmPKh
Line
Count
Source
176
875
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
875
    if (BIT_WIDTH == 0) return 0;
178
179
875
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
875
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
875
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
875
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
875
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
875
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
875
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
875
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
875
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
875
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
875
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
875
    constexpr bool READ_32_BITS =
203
875
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
875
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
875
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
875
    word >>= FIRST_BIT_OFFSET;
213
214
875
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
875
    return word & mask;
221
875
}
_ZN5doris11UnpackValueILi10ELi21ELb0EEEmPKh
Line
Count
Source
176
875
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
875
    if (BIT_WIDTH == 0) return 0;
178
179
875
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
875
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
875
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
875
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
875
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
875
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
875
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
875
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
875
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
875
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
875
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
875
    constexpr bool READ_32_BITS =
203
875
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
875
    if (READ_32_BITS) {
206
875
        uint32_t word = in[FIRST_WORD_IDX];
207
875
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
875
        return word & mask;
209
875
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
875
}
_ZN5doris11UnpackValueILi10ELi20ELb0EEEmPKh
Line
Count
Source
176
875
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
875
    if (BIT_WIDTH == 0) return 0;
178
179
875
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
875
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
875
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
875
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
875
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
875
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
875
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
875
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
875
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
875
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
875
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
875
    constexpr bool READ_32_BITS =
203
875
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
875
    if (READ_32_BITS) {
206
875
        uint32_t word = in[FIRST_WORD_IDX];
207
875
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
875
        return word & mask;
209
875
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
875
}
_ZN5doris11UnpackValueILi10ELi19ELb0EEEmPKh
Line
Count
Source
176
875
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
875
    if (BIT_WIDTH == 0) return 0;
178
179
875
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
875
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
875
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
875
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
875
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
875
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
875
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
875
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
875
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
875
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
875
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
875
    constexpr bool READ_32_BITS =
203
875
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
875
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
875
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
875
    word >>= FIRST_BIT_OFFSET;
213
214
875
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
875
    return word & mask;
221
875
}
_ZN5doris11UnpackValueILi10ELi18ELb0EEEmPKh
Line
Count
Source
176
875
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
875
    if (BIT_WIDTH == 0) return 0;
178
179
875
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
875
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
875
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
875
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
875
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
875
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
875
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
875
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
875
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
875
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
875
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
875
    constexpr bool READ_32_BITS =
203
875
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
875
    if (READ_32_BITS) {
206
875
        uint32_t word = in[FIRST_WORD_IDX];
207
875
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
875
        return word & mask;
209
875
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
875
}
_ZN5doris11UnpackValueILi10ELi17ELb0EEEmPKh
Line
Count
Source
176
875
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
875
    if (BIT_WIDTH == 0) return 0;
178
179
875
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
875
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
875
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
875
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
875
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
875
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
875
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
875
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
875
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
875
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
875
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
875
    constexpr bool READ_32_BITS =
203
875
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
875
    if (READ_32_BITS) {
206
875
        uint32_t word = in[FIRST_WORD_IDX];
207
875
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
875
        return word & mask;
209
875
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
875
}
_ZN5doris11UnpackValueILi10ELi16ELb0EEEmPKh
Line
Count
Source
176
875
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
875
    if (BIT_WIDTH == 0) return 0;
178
179
875
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
875
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
875
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
875
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
875
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
875
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
875
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
875
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
875
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
875
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
875
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
875
    constexpr bool READ_32_BITS =
203
875
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
875
    if (READ_32_BITS) {
206
875
        uint32_t word = in[FIRST_WORD_IDX];
207
875
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
875
        return word & mask;
209
875
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
875
}
_ZN5doris11UnpackValueILi10ELi15ELb0EEEmPKh
Line
Count
Source
176
882
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
882
    if (BIT_WIDTH == 0) return 0;
178
179
882
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
882
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
882
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
882
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
882
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
882
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
882
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
882
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
882
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
882
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
882
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
882
    constexpr bool READ_32_BITS =
203
882
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
882
    if (READ_32_BITS) {
206
882
        uint32_t word = in[FIRST_WORD_IDX];
207
882
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
882
        return word & mask;
209
882
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
882
}
_ZN5doris11UnpackValueILi10ELi14ELb0EEEmPKh
Line
Count
Source
176
882
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
882
    if (BIT_WIDTH == 0) return 0;
178
179
882
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
882
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
882
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
882
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
882
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
882
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
882
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
882
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
882
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
882
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
882
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
882
    constexpr bool READ_32_BITS =
203
882
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
882
    if (READ_32_BITS) {
206
882
        uint32_t word = in[FIRST_WORD_IDX];
207
882
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
882
        return word & mask;
209
882
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
882
}
_ZN5doris11UnpackValueILi10ELi13ELb0EEEmPKh
Line
Count
Source
176
882
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
882
    if (BIT_WIDTH == 0) return 0;
178
179
882
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
882
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
882
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
882
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
882
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
882
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
882
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
882
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
882
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
882
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
882
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
882
    constexpr bool READ_32_BITS =
203
882
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
882
    if (READ_32_BITS) {
206
882
        uint32_t word = in[FIRST_WORD_IDX];
207
882
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
882
        return word & mask;
209
882
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
882
}
_ZN5doris11UnpackValueILi10ELi12ELb0EEEmPKh
Line
Count
Source
176
882
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
882
    if (BIT_WIDTH == 0) return 0;
178
179
882
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
882
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
882
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
882
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
882
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
882
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
882
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
882
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
882
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
882
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
882
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
882
    constexpr bool READ_32_BITS =
203
882
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
882
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
882
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
882
    word >>= FIRST_BIT_OFFSET;
213
214
882
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
882
    return word & mask;
221
882
}
_ZN5doris11UnpackValueILi10ELi11ELb0EEEmPKh
Line
Count
Source
176
882
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
882
    if (BIT_WIDTH == 0) return 0;
178
179
882
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
882
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
882
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
882
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
882
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
882
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
882
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
882
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
882
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
882
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
882
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
882
    constexpr bool READ_32_BITS =
203
882
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
882
    if (READ_32_BITS) {
206
882
        uint32_t word = in[FIRST_WORD_IDX];
207
882
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
882
        return word & mask;
209
882
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
882
}
_ZN5doris11UnpackValueILi10ELi10ELb0EEEmPKh
Line
Count
Source
176
882
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
882
    if (BIT_WIDTH == 0) return 0;
178
179
882
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
882
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
882
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
882
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
882
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
882
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
882
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
882
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
882
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
882
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
882
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
882
    constexpr bool READ_32_BITS =
203
882
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
882
    if (READ_32_BITS) {
206
882
        uint32_t word = in[FIRST_WORD_IDX];
207
882
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
882
        return word & mask;
209
882
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
882
}
_ZN5doris11UnpackValueILi10ELi9ELb0EEEmPKh
Line
Count
Source
176
882
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
882
    if (BIT_WIDTH == 0) return 0;
178
179
882
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
882
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
882
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
882
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
882
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
882
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
882
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
882
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
882
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
882
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
882
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
882
    constexpr bool READ_32_BITS =
203
882
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
882
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
882
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
882
    word >>= FIRST_BIT_OFFSET;
213
214
882
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
882
    return word & mask;
221
882
}
_ZN5doris11UnpackValueILi10ELi8ELb0EEEmPKh
Line
Count
Source
176
882
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
882
    if (BIT_WIDTH == 0) return 0;
178
179
882
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
882
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
882
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
882
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
882
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
882
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
882
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
882
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
882
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
882
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
882
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
882
    constexpr bool READ_32_BITS =
203
882
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
882
    if (READ_32_BITS) {
206
882
        uint32_t word = in[FIRST_WORD_IDX];
207
882
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
882
        return word & mask;
209
882
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
882
}
_ZN5doris11UnpackValueILi10ELi7ELb0EEEmPKh
Line
Count
Source
176
884
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
884
    if (BIT_WIDTH == 0) return 0;
178
179
884
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
884
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
884
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
884
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
884
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
884
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
884
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
884
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
884
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
884
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
884
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
884
    constexpr bool READ_32_BITS =
203
884
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
884
    if (READ_32_BITS) {
206
884
        uint32_t word = in[FIRST_WORD_IDX];
207
884
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
884
        return word & mask;
209
884
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
884
}
_ZN5doris11UnpackValueILi10ELi6ELb0EEEmPKh
Line
Count
Source
176
884
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
884
    if (BIT_WIDTH == 0) return 0;
178
179
884
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
884
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
884
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
884
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
884
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
884
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
884
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
884
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
884
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
884
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
884
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
884
    constexpr bool READ_32_BITS =
203
884
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
884
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
884
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
884
    word >>= FIRST_BIT_OFFSET;
213
214
884
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
884
    return word & mask;
221
884
}
_ZN5doris11UnpackValueILi10ELi5ELb0EEEmPKh
Line
Count
Source
176
884
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
884
    if (BIT_WIDTH == 0) return 0;
178
179
884
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
884
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
884
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
884
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
884
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
884
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
884
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
884
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
884
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
884
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
884
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
884
    constexpr bool READ_32_BITS =
203
884
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
884
    if (READ_32_BITS) {
206
884
        uint32_t word = in[FIRST_WORD_IDX];
207
884
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
884
        return word & mask;
209
884
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
884
}
_ZN5doris11UnpackValueILi10ELi4ELb0EEEmPKh
Line
Count
Source
176
884
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
884
    if (BIT_WIDTH == 0) return 0;
178
179
884
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
884
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
884
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
884
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
884
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
884
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
884
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
884
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
884
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
884
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
884
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
884
    constexpr bool READ_32_BITS =
203
884
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
884
    if (READ_32_BITS) {
206
884
        uint32_t word = in[FIRST_WORD_IDX];
207
884
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
884
        return word & mask;
209
884
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
884
}
_ZN5doris11UnpackValueILi10ELi3ELb0EEEmPKh
Line
Count
Source
176
884
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
884
    if (BIT_WIDTH == 0) return 0;
178
179
884
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
884
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
884
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
884
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
884
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
884
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
884
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
884
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
884
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
884
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
884
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
884
    constexpr bool READ_32_BITS =
203
884
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
884
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
884
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
884
    word >>= FIRST_BIT_OFFSET;
213
214
884
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
884
    return word & mask;
221
884
}
_ZN5doris11UnpackValueILi10ELi2ELb0EEEmPKh
Line
Count
Source
176
884
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
884
    if (BIT_WIDTH == 0) return 0;
178
179
884
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
884
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
884
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
884
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
884
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
884
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
884
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
884
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
884
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
884
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
884
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
884
    constexpr bool READ_32_BITS =
203
884
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
884
    if (READ_32_BITS) {
206
884
        uint32_t word = in[FIRST_WORD_IDX];
207
884
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
884
        return word & mask;
209
884
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
884
}
_ZN5doris11UnpackValueILi10ELi1ELb0EEEmPKh
Line
Count
Source
176
884
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
884
    if (BIT_WIDTH == 0) return 0;
178
179
884
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
884
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
884
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
884
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
884
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
884
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
884
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
884
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
884
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
884
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
884
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
884
    constexpr bool READ_32_BITS =
203
884
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
884
    if (READ_32_BITS) {
206
884
        uint32_t word = in[FIRST_WORD_IDX];
207
884
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
884
        return word & mask;
209
884
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
884
}
_ZN5doris11UnpackValueILi10ELi0ELb0EEEmPKh
Line
Count
Source
176
884
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
884
    if (BIT_WIDTH == 0) return 0;
178
179
884
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
884
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
884
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
884
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
884
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
884
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
884
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
884
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
884
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
884
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
884
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
884
    constexpr bool READ_32_BITS =
203
884
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
884
    if (READ_32_BITS) {
206
884
        uint32_t word = in[FIRST_WORD_IDX];
207
884
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
884
        return word & mask;
209
884
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
884
}
_ZN5doris11UnpackValueILi11ELi0ELb1EEEmPKh
Line
Count
Source
176
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
38.1k
    if (BIT_WIDTH == 0) return 0;
178
179
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
38.1k
    constexpr bool READ_32_BITS =
203
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
38.1k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
38.1k
    word >>= FIRST_BIT_OFFSET;
213
214
38.1k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
38.1k
    return word & mask;
221
38.1k
}
_ZN5doris11UnpackValueILi11ELi1ELb1EEEmPKh
Line
Count
Source
176
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
38.1k
    if (BIT_WIDTH == 0) return 0;
178
179
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
38.1k
    constexpr bool READ_32_BITS =
203
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
38.1k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
38.1k
    word >>= FIRST_BIT_OFFSET;
213
214
38.1k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
38.1k
    return word & mask;
221
38.1k
}
_ZN5doris11UnpackValueILi11ELi2ELb1EEEmPKh
Line
Count
Source
176
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
38.1k
    if (BIT_WIDTH == 0) return 0;
178
179
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
38.1k
    constexpr bool READ_32_BITS =
203
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
38.1k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
38.1k
    word >>= FIRST_BIT_OFFSET;
213
214
38.1k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
38.1k
    return word & mask;
221
38.1k
}
_ZN5doris11UnpackValueILi11ELi3ELb1EEEmPKh
Line
Count
Source
176
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
38.1k
    if (BIT_WIDTH == 0) return 0;
178
179
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
38.1k
    constexpr bool READ_32_BITS =
203
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
38.1k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
38.1k
    word >>= FIRST_BIT_OFFSET;
213
214
38.1k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
38.1k
    return word & mask;
221
38.1k
}
_ZN5doris11UnpackValueILi11ELi4ELb1EEEmPKh
Line
Count
Source
176
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
38.1k
    if (BIT_WIDTH == 0) return 0;
178
179
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
38.1k
    constexpr bool READ_32_BITS =
203
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
38.1k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
38.1k
    word >>= FIRST_BIT_OFFSET;
213
214
38.1k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
38.1k
    return word & mask;
221
38.1k
}
_ZN5doris11UnpackValueILi11ELi5ELb1EEEmPKh
Line
Count
Source
176
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
38.1k
    if (BIT_WIDTH == 0) return 0;
178
179
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
38.1k
    constexpr bool READ_32_BITS =
203
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
38.1k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
38.1k
    word >>= FIRST_BIT_OFFSET;
213
214
38.1k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
38.1k
    return word & mask;
221
38.1k
}
_ZN5doris11UnpackValueILi11ELi6ELb1EEEmPKh
Line
Count
Source
176
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
38.1k
    if (BIT_WIDTH == 0) return 0;
178
179
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
38.1k
    constexpr bool READ_32_BITS =
203
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
38.1k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
38.1k
    word >>= FIRST_BIT_OFFSET;
213
214
38.1k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
38.1k
    return word & mask;
221
38.1k
}
_ZN5doris11UnpackValueILi11ELi7ELb1EEEmPKh
Line
Count
Source
176
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
38.1k
    if (BIT_WIDTH == 0) return 0;
178
179
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
38.1k
    constexpr bool READ_32_BITS =
203
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
38.1k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
38.1k
    word >>= FIRST_BIT_OFFSET;
213
214
38.1k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
38.1k
    return word & mask;
221
38.1k
}
_ZN5doris11UnpackValueILi11ELi8ELb1EEEmPKh
Line
Count
Source
176
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
38.1k
    if (BIT_WIDTH == 0) return 0;
178
179
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
38.1k
    constexpr bool READ_32_BITS =
203
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
38.1k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
38.1k
    word >>= FIRST_BIT_OFFSET;
213
214
38.1k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
38.1k
    return word & mask;
221
38.1k
}
_ZN5doris11UnpackValueILi11ELi9ELb1EEEmPKh
Line
Count
Source
176
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
38.1k
    if (BIT_WIDTH == 0) return 0;
178
179
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
38.1k
    constexpr bool READ_32_BITS =
203
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
38.1k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
38.1k
    word >>= FIRST_BIT_OFFSET;
213
214
38.1k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
38.1k
    return word & mask;
221
38.1k
}
_ZN5doris11UnpackValueILi11ELi10ELb1EEEmPKh
Line
Count
Source
176
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
38.1k
    if (BIT_WIDTH == 0) return 0;
178
179
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
38.1k
    constexpr bool READ_32_BITS =
203
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
38.1k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
38.1k
    word >>= FIRST_BIT_OFFSET;
213
214
38.1k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
38.1k
    return word & mask;
221
38.1k
}
_ZN5doris11UnpackValueILi11ELi11ELb1EEEmPKh
Line
Count
Source
176
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
38.1k
    if (BIT_WIDTH == 0) return 0;
178
179
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
38.1k
    constexpr bool READ_32_BITS =
203
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
38.1k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
38.1k
    word >>= FIRST_BIT_OFFSET;
213
214
38.1k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
38.1k
    return word & mask;
221
38.1k
}
_ZN5doris11UnpackValueILi11ELi12ELb1EEEmPKh
Line
Count
Source
176
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
38.1k
    if (BIT_WIDTH == 0) return 0;
178
179
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
38.1k
    constexpr bool READ_32_BITS =
203
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
38.1k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
38.1k
    word >>= FIRST_BIT_OFFSET;
213
214
38.1k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
38.1k
    return word & mask;
221
38.1k
}
_ZN5doris11UnpackValueILi11ELi13ELb1EEEmPKh
Line
Count
Source
176
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
38.1k
    if (BIT_WIDTH == 0) return 0;
178
179
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
38.1k
    constexpr bool READ_32_BITS =
203
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
38.1k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
38.1k
    word >>= FIRST_BIT_OFFSET;
213
214
38.1k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
38.1k
    return word & mask;
221
38.1k
}
_ZN5doris11UnpackValueILi11ELi14ELb1EEEmPKh
Line
Count
Source
176
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
38.1k
    if (BIT_WIDTH == 0) return 0;
178
179
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
38.1k
    constexpr bool READ_32_BITS =
203
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
38.1k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
38.1k
    word >>= FIRST_BIT_OFFSET;
213
214
38.1k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
38.1k
    return word & mask;
221
38.1k
}
_ZN5doris11UnpackValueILi11ELi15ELb1EEEmPKh
Line
Count
Source
176
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
38.1k
    if (BIT_WIDTH == 0) return 0;
178
179
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
38.1k
    constexpr bool READ_32_BITS =
203
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
38.1k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
38.1k
    word >>= FIRST_BIT_OFFSET;
213
214
38.1k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
38.1k
    return word & mask;
221
38.1k
}
_ZN5doris11UnpackValueILi11ELi16ELb1EEEmPKh
Line
Count
Source
176
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
38.1k
    if (BIT_WIDTH == 0) return 0;
178
179
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
38.1k
    constexpr bool READ_32_BITS =
203
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
38.1k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
38.1k
    word >>= FIRST_BIT_OFFSET;
213
214
38.1k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
38.1k
    return word & mask;
221
38.1k
}
_ZN5doris11UnpackValueILi11ELi17ELb1EEEmPKh
Line
Count
Source
176
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
38.1k
    if (BIT_WIDTH == 0) return 0;
178
179
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
38.1k
    constexpr bool READ_32_BITS =
203
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
38.1k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
38.1k
    word >>= FIRST_BIT_OFFSET;
213
214
38.1k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
38.1k
    return word & mask;
221
38.1k
}
_ZN5doris11UnpackValueILi11ELi18ELb1EEEmPKh
Line
Count
Source
176
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
38.1k
    if (BIT_WIDTH == 0) return 0;
178
179
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
38.1k
    constexpr bool READ_32_BITS =
203
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
38.1k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
38.1k
    word >>= FIRST_BIT_OFFSET;
213
214
38.1k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
38.1k
    return word & mask;
221
38.1k
}
_ZN5doris11UnpackValueILi11ELi19ELb1EEEmPKh
Line
Count
Source
176
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
38.1k
    if (BIT_WIDTH == 0) return 0;
178
179
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
38.1k
    constexpr bool READ_32_BITS =
203
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
38.1k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
38.1k
    word >>= FIRST_BIT_OFFSET;
213
214
38.1k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
38.1k
    return word & mask;
221
38.1k
}
_ZN5doris11UnpackValueILi11ELi20ELb1EEEmPKh
Line
Count
Source
176
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
38.1k
    if (BIT_WIDTH == 0) return 0;
178
179
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
38.1k
    constexpr bool READ_32_BITS =
203
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
38.1k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
38.1k
    word >>= FIRST_BIT_OFFSET;
213
214
38.1k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
38.1k
    return word & mask;
221
38.1k
}
_ZN5doris11UnpackValueILi11ELi21ELb1EEEmPKh
Line
Count
Source
176
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
38.1k
    if (BIT_WIDTH == 0) return 0;
178
179
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
38.1k
    constexpr bool READ_32_BITS =
203
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
38.1k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
38.1k
    word >>= FIRST_BIT_OFFSET;
213
214
38.1k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
38.1k
    return word & mask;
221
38.1k
}
_ZN5doris11UnpackValueILi11ELi22ELb1EEEmPKh
Line
Count
Source
176
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
38.1k
    if (BIT_WIDTH == 0) return 0;
178
179
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
38.1k
    constexpr bool READ_32_BITS =
203
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
38.1k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
38.1k
    word >>= FIRST_BIT_OFFSET;
213
214
38.1k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
38.1k
    return word & mask;
221
38.1k
}
_ZN5doris11UnpackValueILi11ELi23ELb1EEEmPKh
Line
Count
Source
176
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
38.1k
    if (BIT_WIDTH == 0) return 0;
178
179
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
38.1k
    constexpr bool READ_32_BITS =
203
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
38.1k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
38.1k
    word >>= FIRST_BIT_OFFSET;
213
214
38.1k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
38.1k
    return word & mask;
221
38.1k
}
_ZN5doris11UnpackValueILi11ELi24ELb1EEEmPKh
Line
Count
Source
176
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
38.1k
    if (BIT_WIDTH == 0) return 0;
178
179
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
38.1k
    constexpr bool READ_32_BITS =
203
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
38.1k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
38.1k
    word >>= FIRST_BIT_OFFSET;
213
214
38.1k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
38.1k
    return word & mask;
221
38.1k
}
_ZN5doris11UnpackValueILi11ELi25ELb1EEEmPKh
Line
Count
Source
176
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
38.1k
    if (BIT_WIDTH == 0) return 0;
178
179
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
38.1k
    constexpr bool READ_32_BITS =
203
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
38.1k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
38.1k
    word >>= FIRST_BIT_OFFSET;
213
214
38.1k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
38.1k
    return word & mask;
221
38.1k
}
_ZN5doris11UnpackValueILi11ELi26ELb1EEEmPKh
Line
Count
Source
176
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
38.1k
    if (BIT_WIDTH == 0) return 0;
178
179
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
38.1k
    constexpr bool READ_32_BITS =
203
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
38.1k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
38.1k
    word >>= FIRST_BIT_OFFSET;
213
214
38.1k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
38.1k
    return word & mask;
221
38.1k
}
_ZN5doris11UnpackValueILi11ELi27ELb1EEEmPKh
Line
Count
Source
176
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
38.1k
    if (BIT_WIDTH == 0) return 0;
178
179
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
38.1k
    constexpr bool READ_32_BITS =
203
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
38.1k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
38.1k
    word >>= FIRST_BIT_OFFSET;
213
214
38.1k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
38.1k
    return word & mask;
221
38.1k
}
_ZN5doris11UnpackValueILi11ELi28ELb1EEEmPKh
Line
Count
Source
176
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
38.1k
    if (BIT_WIDTH == 0) return 0;
178
179
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
38.1k
    constexpr bool READ_32_BITS =
203
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
38.1k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
38.1k
    word >>= FIRST_BIT_OFFSET;
213
214
38.1k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
38.1k
    return word & mask;
221
38.1k
}
_ZN5doris11UnpackValueILi11ELi29ELb1EEEmPKh
Line
Count
Source
176
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
38.1k
    if (BIT_WIDTH == 0) return 0;
178
179
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
38.1k
    constexpr bool READ_32_BITS =
203
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
38.1k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
38.1k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
38.1k
    word >>= FIRST_BIT_OFFSET;
213
214
38.1k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
38.1k
    return word & mask;
221
38.1k
}
_ZN5doris11UnpackValueILi11ELi30ELb1EEEmPKh
Line
Count
Source
176
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
38.1k
    if (BIT_WIDTH == 0) return 0;
178
179
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
38.1k
    constexpr bool READ_32_BITS =
203
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
38.1k
    if (READ_32_BITS) {
206
38.1k
        uint32_t word = in[FIRST_WORD_IDX];
207
38.1k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
38.1k
        return word & mask;
209
38.1k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
38.1k
}
_ZN5doris11UnpackValueILi11ELi31ELb1EEEmPKh
Line
Count
Source
176
38.1k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
38.1k
    if (BIT_WIDTH == 0) return 0;
178
179
38.1k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
38.1k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
38.1k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
38.1k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
38.1k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
38.1k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
38.1k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
38.1k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
38.1k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
38.1k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
38.1k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
38.1k
    constexpr bool READ_32_BITS =
203
38.1k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
38.1k
    if (READ_32_BITS) {
206
38.1k
        uint32_t word = in[FIRST_WORD_IDX];
207
38.1k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
38.1k
        return word & mask;
209
38.1k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
38.1k
}
Unexecuted instantiation: _ZN5doris11UnpackValueILi11ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi11ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi11ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi11ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi11ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi11ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi11ELi24ELb0EEEmPKh
_ZN5doris11UnpackValueILi11ELi23ELb0EEEmPKh
Line
Count
Source
176
2.50k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
2.50k
    if (BIT_WIDTH == 0) return 0;
178
179
2.50k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
2.50k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
2.50k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
2.50k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
2.50k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
2.50k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
2.50k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
2.50k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
2.50k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
2.50k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
2.50k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
2.50k
    constexpr bool READ_32_BITS =
203
2.50k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
2.50k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
2.50k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
2.50k
    word >>= FIRST_BIT_OFFSET;
213
214
2.50k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
2.50k
    return word & mask;
221
2.50k
}
_ZN5doris11UnpackValueILi11ELi22ELb0EEEmPKh
Line
Count
Source
176
2.50k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
2.50k
    if (BIT_WIDTH == 0) return 0;
178
179
2.50k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
2.50k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
2.50k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
2.50k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
2.50k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
2.50k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
2.50k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
2.50k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
2.50k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
2.50k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
2.50k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
2.50k
    constexpr bool READ_32_BITS =
203
2.50k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
2.50k
    if (READ_32_BITS) {
206
2.50k
        uint32_t word = in[FIRST_WORD_IDX];
207
2.50k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
2.50k
        return word & mask;
209
2.50k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
2.50k
}
_ZN5doris11UnpackValueILi11ELi21ELb0EEEmPKh
Line
Count
Source
176
2.50k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
2.50k
    if (BIT_WIDTH == 0) return 0;
178
179
2.50k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
2.50k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
2.50k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
2.50k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
2.50k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
2.50k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
2.50k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
2.50k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
2.50k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
2.50k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
2.50k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
2.50k
    constexpr bool READ_32_BITS =
203
2.50k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
2.50k
    if (READ_32_BITS) {
206
2.50k
        uint32_t word = in[FIRST_WORD_IDX];
207
2.50k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
2.50k
        return word & mask;
209
2.50k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
2.50k
}
_ZN5doris11UnpackValueILi11ELi20ELb0EEEmPKh
Line
Count
Source
176
2.50k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
2.50k
    if (BIT_WIDTH == 0) return 0;
178
179
2.50k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
2.50k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
2.50k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
2.50k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
2.50k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
2.50k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
2.50k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
2.50k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
2.50k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
2.50k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
2.50k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
2.50k
    constexpr bool READ_32_BITS =
203
2.50k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
2.50k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
2.50k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
2.50k
    word >>= FIRST_BIT_OFFSET;
213
214
2.50k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
2.50k
    return word & mask;
221
2.50k
}
_ZN5doris11UnpackValueILi11ELi19ELb0EEEmPKh
Line
Count
Source
176
2.50k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
2.50k
    if (BIT_WIDTH == 0) return 0;
178
179
2.50k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
2.50k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
2.50k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
2.50k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
2.50k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
2.50k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
2.50k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
2.50k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
2.50k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
2.50k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
2.50k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
2.50k
    constexpr bool READ_32_BITS =
203
2.50k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
2.50k
    if (READ_32_BITS) {
206
2.50k
        uint32_t word = in[FIRST_WORD_IDX];
207
2.50k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
2.50k
        return word & mask;
209
2.50k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
2.50k
}
_ZN5doris11UnpackValueILi11ELi18ELb0EEEmPKh
Line
Count
Source
176
2.50k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
2.50k
    if (BIT_WIDTH == 0) return 0;
178
179
2.50k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
2.50k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
2.50k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
2.50k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
2.50k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
2.50k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
2.50k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
2.50k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
2.50k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
2.50k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
2.50k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
2.50k
    constexpr bool READ_32_BITS =
203
2.50k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
2.50k
    if (READ_32_BITS) {
206
2.50k
        uint32_t word = in[FIRST_WORD_IDX];
207
2.50k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
2.50k
        return word & mask;
209
2.50k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
2.50k
}
_ZN5doris11UnpackValueILi11ELi17ELb0EEEmPKh
Line
Count
Source
176
2.50k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
2.50k
    if (BIT_WIDTH == 0) return 0;
178
179
2.50k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
2.50k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
2.50k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
2.50k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
2.50k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
2.50k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
2.50k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
2.50k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
2.50k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
2.50k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
2.50k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
2.50k
    constexpr bool READ_32_BITS =
203
2.50k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
2.50k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
2.50k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
2.50k
    word >>= FIRST_BIT_OFFSET;
213
214
2.50k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
2.50k
    return word & mask;
221
2.50k
}
_ZN5doris11UnpackValueILi11ELi16ELb0EEEmPKh
Line
Count
Source
176
2.50k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
2.50k
    if (BIT_WIDTH == 0) return 0;
178
179
2.50k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
2.50k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
2.50k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
2.50k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
2.50k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
2.50k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
2.50k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
2.50k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
2.50k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
2.50k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
2.50k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
2.50k
    constexpr bool READ_32_BITS =
203
2.50k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
2.50k
    if (READ_32_BITS) {
206
2.50k
        uint32_t word = in[FIRST_WORD_IDX];
207
2.50k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
2.50k
        return word & mask;
209
2.50k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
2.50k
}
_ZN5doris11UnpackValueILi11ELi15ELb0EEEmPKh
Line
Count
Source
176
2.51k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
2.51k
    if (BIT_WIDTH == 0) return 0;
178
179
2.51k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
2.51k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
2.51k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
2.51k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
2.51k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
2.51k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
2.51k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
2.51k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
2.51k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
2.51k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
2.51k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
2.51k
    constexpr bool READ_32_BITS =
203
2.51k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
2.51k
    if (READ_32_BITS) {
206
2.51k
        uint32_t word = in[FIRST_WORD_IDX];
207
2.51k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
2.51k
        return word & mask;
209
2.51k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
2.51k
}
_ZN5doris11UnpackValueILi11ELi14ELb0EEEmPKh
Line
Count
Source
176
2.51k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
2.51k
    if (BIT_WIDTH == 0) return 0;
178
179
2.51k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
2.51k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
2.51k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
2.51k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
2.51k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
2.51k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
2.51k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
2.51k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
2.51k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
2.51k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
2.51k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
2.51k
    constexpr bool READ_32_BITS =
203
2.51k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
2.51k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
2.51k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
2.51k
    word >>= FIRST_BIT_OFFSET;
213
214
2.51k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
2.51k
    return word & mask;
221
2.51k
}
_ZN5doris11UnpackValueILi11ELi13ELb0EEEmPKh
Line
Count
Source
176
2.51k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
2.51k
    if (BIT_WIDTH == 0) return 0;
178
179
2.51k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
2.51k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
2.51k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
2.51k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
2.51k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
2.51k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
2.51k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
2.51k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
2.51k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
2.51k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
2.51k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
2.51k
    constexpr bool READ_32_BITS =
203
2.51k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
2.51k
    if (READ_32_BITS) {
206
2.51k
        uint32_t word = in[FIRST_WORD_IDX];
207
2.51k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
2.51k
        return word & mask;
209
2.51k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
2.51k
}
_ZN5doris11UnpackValueILi11ELi12ELb0EEEmPKh
Line
Count
Source
176
2.51k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
2.51k
    if (BIT_WIDTH == 0) return 0;
178
179
2.51k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
2.51k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
2.51k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
2.51k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
2.51k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
2.51k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
2.51k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
2.51k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
2.51k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
2.51k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
2.51k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
2.51k
    constexpr bool READ_32_BITS =
203
2.51k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
2.51k
    if (READ_32_BITS) {
206
2.51k
        uint32_t word = in[FIRST_WORD_IDX];
207
2.51k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
2.51k
        return word & mask;
209
2.51k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
2.51k
}
_ZN5doris11UnpackValueILi11ELi11ELb0EEEmPKh
Line
Count
Source
176
2.51k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
2.51k
    if (BIT_WIDTH == 0) return 0;
178
179
2.51k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
2.51k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
2.51k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
2.51k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
2.51k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
2.51k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
2.51k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
2.51k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
2.51k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
2.51k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
2.51k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
2.51k
    constexpr bool READ_32_BITS =
203
2.51k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
2.51k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
2.51k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
2.51k
    word >>= FIRST_BIT_OFFSET;
213
214
2.51k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
2.51k
    return word & mask;
221
2.51k
}
_ZN5doris11UnpackValueILi11ELi10ELb0EEEmPKh
Line
Count
Source
176
2.51k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
2.51k
    if (BIT_WIDTH == 0) return 0;
178
179
2.51k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
2.51k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
2.51k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
2.51k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
2.51k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
2.51k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
2.51k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
2.51k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
2.51k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
2.51k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
2.51k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
2.51k
    constexpr bool READ_32_BITS =
203
2.51k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
2.51k
    if (READ_32_BITS) {
206
2.51k
        uint32_t word = in[FIRST_WORD_IDX];
207
2.51k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
2.51k
        return word & mask;
209
2.51k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
2.51k
}
_ZN5doris11UnpackValueILi11ELi9ELb0EEEmPKh
Line
Count
Source
176
2.51k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
2.51k
    if (BIT_WIDTH == 0) return 0;
178
179
2.51k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
2.51k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
2.51k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
2.51k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
2.51k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
2.51k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
2.51k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
2.51k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
2.51k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
2.51k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
2.51k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
2.51k
    constexpr bool READ_32_BITS =
203
2.51k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
2.51k
    if (READ_32_BITS) {
206
2.51k
        uint32_t word = in[FIRST_WORD_IDX];
207
2.51k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
2.51k
        return word & mask;
209
2.51k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
2.51k
}
_ZN5doris11UnpackValueILi11ELi8ELb0EEEmPKh
Line
Count
Source
176
2.51k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
2.51k
    if (BIT_WIDTH == 0) return 0;
178
179
2.51k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
2.51k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
2.51k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
2.51k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
2.51k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
2.51k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
2.51k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
2.51k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
2.51k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
2.51k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
2.51k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
2.51k
    constexpr bool READ_32_BITS =
203
2.51k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
2.51k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
2.51k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
2.51k
    word >>= FIRST_BIT_OFFSET;
213
214
2.51k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
2.51k
    return word & mask;
221
2.51k
}
_ZN5doris11UnpackValueILi11ELi7ELb0EEEmPKh
Line
Count
Source
176
2.51k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
2.51k
    if (BIT_WIDTH == 0) return 0;
178
179
2.51k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
2.51k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
2.51k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
2.51k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
2.51k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
2.51k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
2.51k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
2.51k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
2.51k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
2.51k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
2.51k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
2.51k
    constexpr bool READ_32_BITS =
203
2.51k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
2.51k
    if (READ_32_BITS) {
206
2.51k
        uint32_t word = in[FIRST_WORD_IDX];
207
2.51k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
2.51k
        return word & mask;
209
2.51k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
2.51k
}
_ZN5doris11UnpackValueILi11ELi6ELb0EEEmPKh
Line
Count
Source
176
2.51k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
2.51k
    if (BIT_WIDTH == 0) return 0;
178
179
2.51k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
2.51k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
2.51k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
2.51k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
2.51k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
2.51k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
2.51k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
2.51k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
2.51k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
2.51k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
2.51k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
2.51k
    constexpr bool READ_32_BITS =
203
2.51k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
2.51k
    if (READ_32_BITS) {
206
2.51k
        uint32_t word = in[FIRST_WORD_IDX];
207
2.51k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
2.51k
        return word & mask;
209
2.51k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
2.51k
}
_ZN5doris11UnpackValueILi11ELi5ELb0EEEmPKh
Line
Count
Source
176
2.51k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
2.51k
    if (BIT_WIDTH == 0) return 0;
178
179
2.51k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
2.51k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
2.51k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
2.51k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
2.51k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
2.51k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
2.51k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
2.51k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
2.51k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
2.51k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
2.51k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
2.51k
    constexpr bool READ_32_BITS =
203
2.51k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
2.51k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
2.51k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
2.51k
    word >>= FIRST_BIT_OFFSET;
213
214
2.51k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
2.51k
    return word & mask;
221
2.51k
}
_ZN5doris11UnpackValueILi11ELi4ELb0EEEmPKh
Line
Count
Source
176
2.51k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
2.51k
    if (BIT_WIDTH == 0) return 0;
178
179
2.51k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
2.51k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
2.51k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
2.51k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
2.51k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
2.51k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
2.51k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
2.51k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
2.51k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
2.51k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
2.51k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
2.51k
    constexpr bool READ_32_BITS =
203
2.51k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
2.51k
    if (READ_32_BITS) {
206
2.51k
        uint32_t word = in[FIRST_WORD_IDX];
207
2.51k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
2.51k
        return word & mask;
209
2.51k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
2.51k
}
_ZN5doris11UnpackValueILi11ELi3ELb0EEEmPKh
Line
Count
Source
176
2.51k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
2.51k
    if (BIT_WIDTH == 0) return 0;
178
179
2.51k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
2.51k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
2.51k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
2.51k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
2.51k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
2.51k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
2.51k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
2.51k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
2.51k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
2.51k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
2.51k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
2.51k
    constexpr bool READ_32_BITS =
203
2.51k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
2.51k
    if (READ_32_BITS) {
206
2.51k
        uint32_t word = in[FIRST_WORD_IDX];
207
2.51k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
2.51k
        return word & mask;
209
2.51k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
2.51k
}
_ZN5doris11UnpackValueILi11ELi2ELb0EEEmPKh
Line
Count
Source
176
2.51k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
2.51k
    if (BIT_WIDTH == 0) return 0;
178
179
2.51k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
2.51k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
2.51k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
2.51k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
2.51k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
2.51k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
2.51k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
2.51k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
2.51k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
2.51k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
2.51k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
2.51k
    constexpr bool READ_32_BITS =
203
2.51k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
2.51k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
2.51k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
2.51k
    word >>= FIRST_BIT_OFFSET;
213
214
2.51k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
2.51k
    return word & mask;
221
2.51k
}
_ZN5doris11UnpackValueILi11ELi1ELb0EEEmPKh
Line
Count
Source
176
2.51k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
2.51k
    if (BIT_WIDTH == 0) return 0;
178
179
2.51k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
2.51k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
2.51k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
2.51k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
2.51k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
2.51k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
2.51k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
2.51k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
2.51k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
2.51k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
2.51k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
2.51k
    constexpr bool READ_32_BITS =
203
2.51k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
2.51k
    if (READ_32_BITS) {
206
2.51k
        uint32_t word = in[FIRST_WORD_IDX];
207
2.51k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
2.51k
        return word & mask;
209
2.51k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
2.51k
}
_ZN5doris11UnpackValueILi11ELi0ELb0EEEmPKh
Line
Count
Source
176
2.51k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
2.51k
    if (BIT_WIDTH == 0) return 0;
178
179
2.51k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
2.51k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
2.51k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
2.51k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
2.51k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
2.51k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
2.51k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
2.51k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
2.51k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
2.51k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
2.51k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
2.51k
    constexpr bool READ_32_BITS =
203
2.51k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
2.51k
    if (READ_32_BITS) {
206
2.51k
        uint32_t word = in[FIRST_WORD_IDX];
207
2.51k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
2.51k
        return word & mask;
209
2.51k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
2.51k
}
_ZN5doris11UnpackValueILi12ELi0ELb1EEEmPKh
Line
Count
Source
176
71.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
71.7k
    if (BIT_WIDTH == 0) return 0;
178
179
71.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
71.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
71.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
71.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
71.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
71.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
71.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
71.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
71.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
71.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
71.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
71.7k
    constexpr bool READ_32_BITS =
203
71.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
71.7k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
71.7k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
71.7k
    word >>= FIRST_BIT_OFFSET;
213
214
71.7k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
71.7k
    return word & mask;
221
71.7k
}
_ZN5doris11UnpackValueILi12ELi1ELb1EEEmPKh
Line
Count
Source
176
71.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
71.7k
    if (BIT_WIDTH == 0) return 0;
178
179
71.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
71.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
71.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
71.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
71.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
71.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
71.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
71.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
71.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
71.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
71.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
71.7k
    constexpr bool READ_32_BITS =
203
71.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
71.7k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
71.7k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
71.7k
    word >>= FIRST_BIT_OFFSET;
213
214
71.7k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
71.7k
    return word & mask;
221
71.7k
}
_ZN5doris11UnpackValueILi12ELi2ELb1EEEmPKh
Line
Count
Source
176
71.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
71.7k
    if (BIT_WIDTH == 0) return 0;
178
179
71.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
71.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
71.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
71.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
71.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
71.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
71.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
71.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
71.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
71.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
71.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
71.7k
    constexpr bool READ_32_BITS =
203
71.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
71.7k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
71.7k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
71.7k
    word >>= FIRST_BIT_OFFSET;
213
214
71.7k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
71.7k
    return word & mask;
221
71.7k
}
_ZN5doris11UnpackValueILi12ELi3ELb1EEEmPKh
Line
Count
Source
176
71.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
71.7k
    if (BIT_WIDTH == 0) return 0;
178
179
71.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
71.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
71.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
71.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
71.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
71.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
71.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
71.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
71.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
71.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
71.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
71.7k
    constexpr bool READ_32_BITS =
203
71.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
71.7k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
71.7k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
71.7k
    word >>= FIRST_BIT_OFFSET;
213
214
71.7k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
71.7k
    return word & mask;
221
71.7k
}
_ZN5doris11UnpackValueILi12ELi4ELb1EEEmPKh
Line
Count
Source
176
71.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
71.7k
    if (BIT_WIDTH == 0) return 0;
178
179
71.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
71.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
71.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
71.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
71.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
71.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
71.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
71.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
71.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
71.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
71.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
71.7k
    constexpr bool READ_32_BITS =
203
71.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
71.7k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
71.7k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
71.7k
    word >>= FIRST_BIT_OFFSET;
213
214
71.7k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
71.7k
    return word & mask;
221
71.7k
}
_ZN5doris11UnpackValueILi12ELi5ELb1EEEmPKh
Line
Count
Source
176
71.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
71.7k
    if (BIT_WIDTH == 0) return 0;
178
179
71.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
71.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
71.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
71.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
71.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
71.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
71.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
71.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
71.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
71.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
71.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
71.7k
    constexpr bool READ_32_BITS =
203
71.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
71.7k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
71.7k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
71.7k
    word >>= FIRST_BIT_OFFSET;
213
214
71.7k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
71.7k
    return word & mask;
221
71.7k
}
_ZN5doris11UnpackValueILi12ELi6ELb1EEEmPKh
Line
Count
Source
176
71.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
71.7k
    if (BIT_WIDTH == 0) return 0;
178
179
71.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
71.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
71.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
71.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
71.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
71.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
71.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
71.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
71.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
71.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
71.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
71.7k
    constexpr bool READ_32_BITS =
203
71.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
71.7k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
71.7k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
71.7k
    word >>= FIRST_BIT_OFFSET;
213
214
71.7k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
71.7k
    return word & mask;
221
71.7k
}
_ZN5doris11UnpackValueILi12ELi7ELb1EEEmPKh
Line
Count
Source
176
71.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
71.7k
    if (BIT_WIDTH == 0) return 0;
178
179
71.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
71.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
71.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
71.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
71.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
71.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
71.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
71.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
71.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
71.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
71.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
71.7k
    constexpr bool READ_32_BITS =
203
71.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
71.7k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
71.7k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
71.7k
    word >>= FIRST_BIT_OFFSET;
213
214
71.7k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
71.7k
    return word & mask;
221
71.7k
}
_ZN5doris11UnpackValueILi12ELi8ELb1EEEmPKh
Line
Count
Source
176
71.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
71.7k
    if (BIT_WIDTH == 0) return 0;
178
179
71.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
71.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
71.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
71.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
71.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
71.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
71.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
71.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
71.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
71.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
71.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
71.7k
    constexpr bool READ_32_BITS =
203
71.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
71.7k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
71.7k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
71.7k
    word >>= FIRST_BIT_OFFSET;
213
214
71.7k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
71.7k
    return word & mask;
221
71.7k
}
_ZN5doris11UnpackValueILi12ELi9ELb1EEEmPKh
Line
Count
Source
176
71.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
71.7k
    if (BIT_WIDTH == 0) return 0;
178
179
71.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
71.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
71.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
71.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
71.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
71.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
71.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
71.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
71.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
71.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
71.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
71.7k
    constexpr bool READ_32_BITS =
203
71.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
71.7k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
71.7k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
71.7k
    word >>= FIRST_BIT_OFFSET;
213
214
71.7k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
71.7k
    return word & mask;
221
71.7k
}
_ZN5doris11UnpackValueILi12ELi10ELb1EEEmPKh
Line
Count
Source
176
71.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
71.7k
    if (BIT_WIDTH == 0) return 0;
178
179
71.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
71.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
71.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
71.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
71.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
71.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
71.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
71.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
71.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
71.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
71.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
71.7k
    constexpr bool READ_32_BITS =
203
71.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
71.7k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
71.7k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
71.7k
    word >>= FIRST_BIT_OFFSET;
213
214
71.7k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
71.7k
    return word & mask;
221
71.7k
}
_ZN5doris11UnpackValueILi12ELi11ELb1EEEmPKh
Line
Count
Source
176
71.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
71.7k
    if (BIT_WIDTH == 0) return 0;
178
179
71.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
71.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
71.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
71.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
71.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
71.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
71.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
71.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
71.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
71.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
71.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
71.7k
    constexpr bool READ_32_BITS =
203
71.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
71.7k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
71.7k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
71.7k
    word >>= FIRST_BIT_OFFSET;
213
214
71.7k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
71.7k
    return word & mask;
221
71.7k
}
_ZN5doris11UnpackValueILi12ELi12ELb1EEEmPKh
Line
Count
Source
176
71.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
71.7k
    if (BIT_WIDTH == 0) return 0;
178
179
71.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
71.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
71.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
71.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
71.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
71.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
71.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
71.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
71.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
71.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
71.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
71.7k
    constexpr bool READ_32_BITS =
203
71.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
71.7k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
71.7k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
71.7k
    word >>= FIRST_BIT_OFFSET;
213
214
71.7k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
71.7k
    return word & mask;
221
71.7k
}
_ZN5doris11UnpackValueILi12ELi13ELb1EEEmPKh
Line
Count
Source
176
71.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
71.7k
    if (BIT_WIDTH == 0) return 0;
178
179
71.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
71.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
71.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
71.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
71.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
71.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
71.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
71.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
71.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
71.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
71.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
71.7k
    constexpr bool READ_32_BITS =
203
71.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
71.7k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
71.7k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
71.7k
    word >>= FIRST_BIT_OFFSET;
213
214
71.7k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
71.7k
    return word & mask;
221
71.7k
}
_ZN5doris11UnpackValueILi12ELi14ELb1EEEmPKh
Line
Count
Source
176
71.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
71.7k
    if (BIT_WIDTH == 0) return 0;
178
179
71.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
71.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
71.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
71.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
71.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
71.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
71.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
71.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
71.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
71.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
71.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
71.7k
    constexpr bool READ_32_BITS =
203
71.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
71.7k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
71.7k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
71.7k
    word >>= FIRST_BIT_OFFSET;
213
214
71.7k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
71.7k
    return word & mask;
221
71.7k
}
_ZN5doris11UnpackValueILi12ELi15ELb1EEEmPKh
Line
Count
Source
176
71.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
71.7k
    if (BIT_WIDTH == 0) return 0;
178
179
71.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
71.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
71.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
71.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
71.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
71.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
71.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
71.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
71.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
71.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
71.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
71.7k
    constexpr bool READ_32_BITS =
203
71.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
71.7k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
71.7k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
71.7k
    word >>= FIRST_BIT_OFFSET;
213
214
71.7k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
71.7k
    return word & mask;
221
71.7k
}
_ZN5doris11UnpackValueILi12ELi16ELb1EEEmPKh
Line
Count
Source
176
71.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
71.7k
    if (BIT_WIDTH == 0) return 0;
178
179
71.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
71.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
71.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
71.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
71.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
71.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
71.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
71.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
71.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
71.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
71.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
71.7k
    constexpr bool READ_32_BITS =
203
71.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
71.7k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
71.7k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
71.7k
    word >>= FIRST_BIT_OFFSET;
213
214
71.7k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
71.7k
    return word & mask;
221
71.7k
}
_ZN5doris11UnpackValueILi12ELi17ELb1EEEmPKh
Line
Count
Source
176
71.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
71.7k
    if (BIT_WIDTH == 0) return 0;
178
179
71.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
71.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
71.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
71.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
71.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
71.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
71.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
71.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
71.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
71.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
71.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
71.7k
    constexpr bool READ_32_BITS =
203
71.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
71.7k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
71.7k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
71.7k
    word >>= FIRST_BIT_OFFSET;
213
214
71.7k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
71.7k
    return word & mask;
221
71.7k
}
_ZN5doris11UnpackValueILi12ELi18ELb1EEEmPKh
Line
Count
Source
176
71.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
71.7k
    if (BIT_WIDTH == 0) return 0;
178
179
71.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
71.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
71.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
71.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
71.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
71.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
71.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
71.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
71.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
71.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
71.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
71.7k
    constexpr bool READ_32_BITS =
203
71.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
71.7k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
71.7k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
71.7k
    word >>= FIRST_BIT_OFFSET;
213
214
71.7k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
71.7k
    return word & mask;
221
71.7k
}
_ZN5doris11UnpackValueILi12ELi19ELb1EEEmPKh
Line
Count
Source
176
71.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
71.7k
    if (BIT_WIDTH == 0) return 0;
178
179
71.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
71.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
71.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
71.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
71.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
71.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
71.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
71.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
71.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
71.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
71.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
71.7k
    constexpr bool READ_32_BITS =
203
71.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
71.7k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
71.7k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
71.7k
    word >>= FIRST_BIT_OFFSET;
213
214
71.7k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
71.7k
    return word & mask;
221
71.7k
}
_ZN5doris11UnpackValueILi12ELi20ELb1EEEmPKh
Line
Count
Source
176
71.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
71.7k
    if (BIT_WIDTH == 0) return 0;
178
179
71.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
71.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
71.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
71.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
71.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
71.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
71.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
71.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
71.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
71.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
71.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
71.7k
    constexpr bool READ_32_BITS =
203
71.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
71.7k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
71.7k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
71.7k
    word >>= FIRST_BIT_OFFSET;
213
214
71.7k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
71.7k
    return word & mask;
221
71.7k
}
_ZN5doris11UnpackValueILi12ELi21ELb1EEEmPKh
Line
Count
Source
176
71.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
71.7k
    if (BIT_WIDTH == 0) return 0;
178
179
71.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
71.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
71.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
71.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
71.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
71.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
71.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
71.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
71.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
71.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
71.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
71.7k
    constexpr bool READ_32_BITS =
203
71.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
71.7k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
71.7k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
71.7k
    word >>= FIRST_BIT_OFFSET;
213
214
71.7k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
71.7k
    return word & mask;
221
71.7k
}
_ZN5doris11UnpackValueILi12ELi22ELb1EEEmPKh
Line
Count
Source
176
71.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
71.7k
    if (BIT_WIDTH == 0) return 0;
178
179
71.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
71.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
71.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
71.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
71.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
71.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
71.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
71.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
71.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
71.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
71.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
71.7k
    constexpr bool READ_32_BITS =
203
71.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
71.7k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
71.7k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
71.7k
    word >>= FIRST_BIT_OFFSET;
213
214
71.7k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
71.7k
    return word & mask;
221
71.7k
}
_ZN5doris11UnpackValueILi12ELi23ELb1EEEmPKh
Line
Count
Source
176
71.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
71.7k
    if (BIT_WIDTH == 0) return 0;
178
179
71.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
71.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
71.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
71.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
71.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
71.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
71.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
71.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
71.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
71.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
71.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
71.7k
    constexpr bool READ_32_BITS =
203
71.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
71.7k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
71.7k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
71.7k
    word >>= FIRST_BIT_OFFSET;
213
214
71.7k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
71.7k
    return word & mask;
221
71.7k
}
_ZN5doris11UnpackValueILi12ELi24ELb1EEEmPKh
Line
Count
Source
176
71.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
71.7k
    if (BIT_WIDTH == 0) return 0;
178
179
71.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
71.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
71.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
71.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
71.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
71.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
71.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
71.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
71.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
71.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
71.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
71.7k
    constexpr bool READ_32_BITS =
203
71.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
71.7k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
71.7k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
71.7k
    word >>= FIRST_BIT_OFFSET;
213
214
71.7k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
71.7k
    return word & mask;
221
71.7k
}
_ZN5doris11UnpackValueILi12ELi25ELb1EEEmPKh
Line
Count
Source
176
71.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
71.7k
    if (BIT_WIDTH == 0) return 0;
178
179
71.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
71.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
71.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
71.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
71.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
71.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
71.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
71.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
71.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
71.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
71.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
71.7k
    constexpr bool READ_32_BITS =
203
71.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
71.7k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
71.7k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
71.7k
    word >>= FIRST_BIT_OFFSET;
213
214
71.7k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
71.7k
    return word & mask;
221
71.7k
}
_ZN5doris11UnpackValueILi12ELi26ELb1EEEmPKh
Line
Count
Source
176
71.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
71.7k
    if (BIT_WIDTH == 0) return 0;
178
179
71.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
71.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
71.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
71.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
71.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
71.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
71.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
71.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
71.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
71.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
71.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
71.7k
    constexpr bool READ_32_BITS =
203
71.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
71.7k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
71.7k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
71.7k
    word >>= FIRST_BIT_OFFSET;
213
214
71.7k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
71.7k
    return word & mask;
221
71.7k
}
_ZN5doris11UnpackValueILi12ELi27ELb1EEEmPKh
Line
Count
Source
176
71.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
71.7k
    if (BIT_WIDTH == 0) return 0;
178
179
71.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
71.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
71.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
71.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
71.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
71.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
71.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
71.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
71.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
71.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
71.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
71.7k
    constexpr bool READ_32_BITS =
203
71.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
71.7k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
71.7k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
71.7k
    word >>= FIRST_BIT_OFFSET;
213
214
71.7k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
71.7k
    return word & mask;
221
71.7k
}
_ZN5doris11UnpackValueILi12ELi28ELb1EEEmPKh
Line
Count
Source
176
71.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
71.7k
    if (BIT_WIDTH == 0) return 0;
178
179
71.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
71.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
71.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
71.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
71.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
71.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
71.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
71.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
71.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
71.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
71.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
71.7k
    constexpr bool READ_32_BITS =
203
71.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
71.7k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
71.7k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
71.7k
    word >>= FIRST_BIT_OFFSET;
213
214
71.7k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
71.7k
    return word & mask;
221
71.7k
}
_ZN5doris11UnpackValueILi12ELi29ELb1EEEmPKh
Line
Count
Source
176
71.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
71.7k
    if (BIT_WIDTH == 0) return 0;
178
179
71.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
71.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
71.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
71.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
71.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
71.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
71.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
71.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
71.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
71.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
71.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
71.7k
    constexpr bool READ_32_BITS =
203
71.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
71.7k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
71.7k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
71.7k
    word >>= FIRST_BIT_OFFSET;
213
214
71.7k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
71.7k
    return word & mask;
221
71.7k
}
_ZN5doris11UnpackValueILi12ELi30ELb1EEEmPKh
Line
Count
Source
176
71.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
71.7k
    if (BIT_WIDTH == 0) return 0;
178
179
71.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
71.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
71.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
71.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
71.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
71.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
71.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
71.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
71.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
71.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
71.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
71.7k
    constexpr bool READ_32_BITS =
203
71.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
71.7k
    if (READ_32_BITS) {
206
71.7k
        uint32_t word = in[FIRST_WORD_IDX];
207
71.7k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
71.7k
        return word & mask;
209
71.7k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
71.7k
}
_ZN5doris11UnpackValueILi12ELi31ELb1EEEmPKh
Line
Count
Source
176
71.7k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
71.7k
    if (BIT_WIDTH == 0) return 0;
178
179
71.7k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
71.7k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
71.7k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
71.7k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
71.7k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
71.7k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
71.7k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
71.7k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
71.7k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
71.7k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
71.7k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
71.7k
    constexpr bool READ_32_BITS =
203
71.7k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
71.7k
    if (READ_32_BITS) {
206
71.7k
        uint32_t word = in[FIRST_WORD_IDX];
207
71.7k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
71.7k
        return word & mask;
209
71.7k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
71.7k
}
Unexecuted instantiation: _ZN5doris11UnpackValueILi12ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi12ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi12ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi12ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi12ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi12ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi12ELi24ELb0EEEmPKh
_ZN5doris11UnpackValueILi12ELi23ELb0EEEmPKh
Line
Count
Source
176
4.76k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.76k
    if (BIT_WIDTH == 0) return 0;
178
179
4.76k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.76k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.76k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.76k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.76k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.76k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.76k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.76k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.76k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.76k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.76k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.76k
    constexpr bool READ_32_BITS =
203
4.76k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.76k
    if (READ_32_BITS) {
206
4.76k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.76k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.76k
        return word & mask;
209
4.76k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.76k
}
_ZN5doris11UnpackValueILi12ELi22ELb0EEEmPKh
Line
Count
Source
176
4.76k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.76k
    if (BIT_WIDTH == 0) return 0;
178
179
4.76k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.76k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.76k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.76k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.76k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.76k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.76k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.76k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.76k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.76k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.76k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.76k
    constexpr bool READ_32_BITS =
203
4.76k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.76k
    if (READ_32_BITS) {
206
4.76k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.76k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.76k
        return word & mask;
209
4.76k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.76k
}
_ZN5doris11UnpackValueILi12ELi21ELb0EEEmPKh
Line
Count
Source
176
4.76k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.76k
    if (BIT_WIDTH == 0) return 0;
178
179
4.76k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.76k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.76k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.76k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.76k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.76k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.76k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.76k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.76k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.76k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.76k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.76k
    constexpr bool READ_32_BITS =
203
4.76k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.76k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.76k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.76k
    word >>= FIRST_BIT_OFFSET;
213
214
4.76k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.76k
    return word & mask;
221
4.76k
}
_ZN5doris11UnpackValueILi12ELi20ELb0EEEmPKh
Line
Count
Source
176
4.76k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.76k
    if (BIT_WIDTH == 0) return 0;
178
179
4.76k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.76k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.76k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.76k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.76k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.76k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.76k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.76k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.76k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.76k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.76k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.76k
    constexpr bool READ_32_BITS =
203
4.76k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.76k
    if (READ_32_BITS) {
206
4.76k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.76k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.76k
        return word & mask;
209
4.76k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.76k
}
_ZN5doris11UnpackValueILi12ELi19ELb0EEEmPKh
Line
Count
Source
176
4.76k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.76k
    if (BIT_WIDTH == 0) return 0;
178
179
4.76k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.76k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.76k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.76k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.76k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.76k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.76k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.76k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.76k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.76k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.76k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.76k
    constexpr bool READ_32_BITS =
203
4.76k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.76k
    if (READ_32_BITS) {
206
4.76k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.76k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.76k
        return word & mask;
209
4.76k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.76k
}
_ZN5doris11UnpackValueILi12ELi18ELb0EEEmPKh
Line
Count
Source
176
4.76k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.76k
    if (BIT_WIDTH == 0) return 0;
178
179
4.76k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.76k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.76k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.76k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.76k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.76k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.76k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.76k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.76k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.76k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.76k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.76k
    constexpr bool READ_32_BITS =
203
4.76k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.76k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.76k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.76k
    word >>= FIRST_BIT_OFFSET;
213
214
4.76k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.76k
    return word & mask;
221
4.76k
}
_ZN5doris11UnpackValueILi12ELi17ELb0EEEmPKh
Line
Count
Source
176
4.76k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.76k
    if (BIT_WIDTH == 0) return 0;
178
179
4.76k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.76k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.76k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.76k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.76k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.76k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.76k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.76k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.76k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.76k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.76k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.76k
    constexpr bool READ_32_BITS =
203
4.76k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.76k
    if (READ_32_BITS) {
206
4.76k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.76k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.76k
        return word & mask;
209
4.76k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.76k
}
_ZN5doris11UnpackValueILi12ELi16ELb0EEEmPKh
Line
Count
Source
176
4.76k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.76k
    if (BIT_WIDTH == 0) return 0;
178
179
4.76k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.76k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.76k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.76k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.76k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.76k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.76k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.76k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.76k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.76k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.76k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.76k
    constexpr bool READ_32_BITS =
203
4.76k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.76k
    if (READ_32_BITS) {
206
4.76k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.76k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.76k
        return word & mask;
209
4.76k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.76k
}
_ZN5doris11UnpackValueILi12ELi15ELb0EEEmPKh
Line
Count
Source
176
4.77k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.77k
    if (BIT_WIDTH == 0) return 0;
178
179
4.77k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.77k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.77k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.77k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.77k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.77k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.77k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.77k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.77k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.77k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.77k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.77k
    constexpr bool READ_32_BITS =
203
4.77k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.77k
    if (READ_32_BITS) {
206
4.77k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.77k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.77k
        return word & mask;
209
4.77k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.77k
}
_ZN5doris11UnpackValueILi12ELi14ELb0EEEmPKh
Line
Count
Source
176
4.77k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.77k
    if (BIT_WIDTH == 0) return 0;
178
179
4.77k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.77k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.77k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.77k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.77k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.77k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.77k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.77k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.77k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.77k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.77k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.77k
    constexpr bool READ_32_BITS =
203
4.77k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.77k
    if (READ_32_BITS) {
206
4.77k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.77k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.77k
        return word & mask;
209
4.77k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.77k
}
_ZN5doris11UnpackValueILi12ELi13ELb0EEEmPKh
Line
Count
Source
176
4.77k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.77k
    if (BIT_WIDTH == 0) return 0;
178
179
4.77k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.77k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.77k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.77k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.77k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.77k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.77k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.77k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.77k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.77k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.77k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.77k
    constexpr bool READ_32_BITS =
203
4.77k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.77k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.77k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.77k
    word >>= FIRST_BIT_OFFSET;
213
214
4.77k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.77k
    return word & mask;
221
4.77k
}
_ZN5doris11UnpackValueILi12ELi12ELb0EEEmPKh
Line
Count
Source
176
4.77k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.77k
    if (BIT_WIDTH == 0) return 0;
178
179
4.77k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.77k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.77k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.77k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.77k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.77k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.77k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.77k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.77k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.77k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.77k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.77k
    constexpr bool READ_32_BITS =
203
4.77k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.77k
    if (READ_32_BITS) {
206
4.77k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.77k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.77k
        return word & mask;
209
4.77k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.77k
}
_ZN5doris11UnpackValueILi12ELi11ELb0EEEmPKh
Line
Count
Source
176
4.77k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.77k
    if (BIT_WIDTH == 0) return 0;
178
179
4.77k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.77k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.77k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.77k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.77k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.77k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.77k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.77k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.77k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.77k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.77k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.77k
    constexpr bool READ_32_BITS =
203
4.77k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.77k
    if (READ_32_BITS) {
206
4.77k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.77k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.77k
        return word & mask;
209
4.77k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.77k
}
_ZN5doris11UnpackValueILi12ELi10ELb0EEEmPKh
Line
Count
Source
176
4.77k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.77k
    if (BIT_WIDTH == 0) return 0;
178
179
4.77k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.77k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.77k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.77k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.77k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.77k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.77k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.77k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.77k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.77k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.77k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.77k
    constexpr bool READ_32_BITS =
203
4.77k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.77k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.77k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.77k
    word >>= FIRST_BIT_OFFSET;
213
214
4.77k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.77k
    return word & mask;
221
4.77k
}
_ZN5doris11UnpackValueILi12ELi9ELb0EEEmPKh
Line
Count
Source
176
4.77k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.77k
    if (BIT_WIDTH == 0) return 0;
178
179
4.77k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.77k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.77k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.77k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.77k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.77k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.77k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.77k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.77k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.77k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.77k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.77k
    constexpr bool READ_32_BITS =
203
4.77k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.77k
    if (READ_32_BITS) {
206
4.77k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.77k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.77k
        return word & mask;
209
4.77k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.77k
}
_ZN5doris11UnpackValueILi12ELi8ELb0EEEmPKh
Line
Count
Source
176
4.77k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.77k
    if (BIT_WIDTH == 0) return 0;
178
179
4.77k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.77k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.77k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.77k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.77k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.77k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.77k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.77k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.77k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.77k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.77k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.77k
    constexpr bool READ_32_BITS =
203
4.77k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.77k
    if (READ_32_BITS) {
206
4.77k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.77k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.77k
        return word & mask;
209
4.77k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.77k
}
_ZN5doris11UnpackValueILi12ELi7ELb0EEEmPKh
Line
Count
Source
176
4.78k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.78k
    if (BIT_WIDTH == 0) return 0;
178
179
4.78k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.78k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.78k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.78k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.78k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.78k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.78k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.78k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.78k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.78k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.78k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.78k
    constexpr bool READ_32_BITS =
203
4.78k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.78k
    if (READ_32_BITS) {
206
4.78k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.78k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.78k
        return word & mask;
209
4.78k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.78k
}
_ZN5doris11UnpackValueILi12ELi6ELb0EEEmPKh
Line
Count
Source
176
4.78k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.78k
    if (BIT_WIDTH == 0) return 0;
178
179
4.78k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.78k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.78k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.78k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.78k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.78k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.78k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.78k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.78k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.78k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.78k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.78k
    constexpr bool READ_32_BITS =
203
4.78k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.78k
    if (READ_32_BITS) {
206
4.78k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.78k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.78k
        return word & mask;
209
4.78k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.78k
}
_ZN5doris11UnpackValueILi12ELi5ELb0EEEmPKh
Line
Count
Source
176
4.78k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.78k
    if (BIT_WIDTH == 0) return 0;
178
179
4.78k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.78k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.78k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.78k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.78k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.78k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.78k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.78k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.78k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.78k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.78k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.78k
    constexpr bool READ_32_BITS =
203
4.78k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.78k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.78k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.78k
    word >>= FIRST_BIT_OFFSET;
213
214
4.78k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.78k
    return word & mask;
221
4.78k
}
_ZN5doris11UnpackValueILi12ELi4ELb0EEEmPKh
Line
Count
Source
176
4.78k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.78k
    if (BIT_WIDTH == 0) return 0;
178
179
4.78k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.78k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.78k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.78k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.78k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.78k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.78k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.78k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.78k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.78k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.78k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.78k
    constexpr bool READ_32_BITS =
203
4.78k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.78k
    if (READ_32_BITS) {
206
4.78k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.78k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.78k
        return word & mask;
209
4.78k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.78k
}
_ZN5doris11UnpackValueILi12ELi3ELb0EEEmPKh
Line
Count
Source
176
4.78k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.78k
    if (BIT_WIDTH == 0) return 0;
178
179
4.78k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.78k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.78k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.78k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.78k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.78k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.78k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.78k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.78k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.78k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.78k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.78k
    constexpr bool READ_32_BITS =
203
4.78k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.78k
    if (READ_32_BITS) {
206
4.78k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.78k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.78k
        return word & mask;
209
4.78k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.78k
}
_ZN5doris11UnpackValueILi12ELi2ELb0EEEmPKh
Line
Count
Source
176
4.78k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.78k
    if (BIT_WIDTH == 0) return 0;
178
179
4.78k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.78k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.78k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.78k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.78k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.78k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.78k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.78k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.78k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.78k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.78k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.78k
    constexpr bool READ_32_BITS =
203
4.78k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.78k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
4.78k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
4.78k
    word >>= FIRST_BIT_OFFSET;
213
214
4.78k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
4.78k
    return word & mask;
221
4.78k
}
_ZN5doris11UnpackValueILi12ELi1ELb0EEEmPKh
Line
Count
Source
176
4.78k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.78k
    if (BIT_WIDTH == 0) return 0;
178
179
4.78k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.78k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.78k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.78k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.78k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.78k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.78k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.78k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.78k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.78k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.78k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.78k
    constexpr bool READ_32_BITS =
203
4.78k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.78k
    if (READ_32_BITS) {
206
4.78k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.78k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.78k
        return word & mask;
209
4.78k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.78k
}
_ZN5doris11UnpackValueILi12ELi0ELb0EEEmPKh
Line
Count
Source
176
4.78k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.78k
    if (BIT_WIDTH == 0) return 0;
178
179
4.78k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.78k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.78k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.78k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.78k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.78k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.78k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.78k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.78k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.78k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.78k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.78k
    constexpr bool READ_32_BITS =
203
4.78k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.78k
    if (READ_32_BITS) {
206
4.78k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.78k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.78k
        return word & mask;
209
4.78k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.78k
}
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi13ELi0ELb0EEEmPKh
_ZN5doris11UnpackValueILi14ELi0ELb1EEEmPKh
Line
Count
Source
176
1.48k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.48k
    if (BIT_WIDTH == 0) return 0;
178
179
1.48k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.48k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.48k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.48k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.48k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.48k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.48k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.48k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.48k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.48k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.48k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.48k
    constexpr bool READ_32_BITS =
203
1.48k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.48k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
1.48k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
1.48k
    word >>= FIRST_BIT_OFFSET;
213
214
1.48k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
1.48k
    return word & mask;
221
1.48k
}
_ZN5doris11UnpackValueILi14ELi1ELb1EEEmPKh
Line
Count
Source
176
1.48k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.48k
    if (BIT_WIDTH == 0) return 0;
178
179
1.48k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.48k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.48k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.48k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.48k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.48k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.48k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.48k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.48k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.48k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.48k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.48k
    constexpr bool READ_32_BITS =
203
1.48k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.48k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
1.48k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
1.48k
    word >>= FIRST_BIT_OFFSET;
213
214
1.48k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
1.48k
    return word & mask;
221
1.48k
}
_ZN5doris11UnpackValueILi14ELi2ELb1EEEmPKh
Line
Count
Source
176
1.48k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.48k
    if (BIT_WIDTH == 0) return 0;
178
179
1.48k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.48k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.48k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.48k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.48k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.48k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.48k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.48k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.48k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.48k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.48k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.48k
    constexpr bool READ_32_BITS =
203
1.48k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.48k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
1.48k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
1.48k
    word >>= FIRST_BIT_OFFSET;
213
214
1.48k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
1.48k
    return word & mask;
221
1.48k
}
_ZN5doris11UnpackValueILi14ELi3ELb1EEEmPKh
Line
Count
Source
176
1.48k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.48k
    if (BIT_WIDTH == 0) return 0;
178
179
1.48k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.48k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.48k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.48k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.48k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.48k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.48k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.48k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.48k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.48k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.48k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.48k
    constexpr bool READ_32_BITS =
203
1.48k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.48k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
1.48k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
1.48k
    word >>= FIRST_BIT_OFFSET;
213
214
1.48k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
1.48k
    return word & mask;
221
1.48k
}
_ZN5doris11UnpackValueILi14ELi4ELb1EEEmPKh
Line
Count
Source
176
1.48k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.48k
    if (BIT_WIDTH == 0) return 0;
178
179
1.48k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.48k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.48k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.48k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.48k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.48k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.48k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.48k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.48k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.48k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.48k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.48k
    constexpr bool READ_32_BITS =
203
1.48k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.48k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
1.48k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
1.48k
    word >>= FIRST_BIT_OFFSET;
213
214
1.48k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
1.48k
    return word & mask;
221
1.48k
}
_ZN5doris11UnpackValueILi14ELi5ELb1EEEmPKh
Line
Count
Source
176
1.48k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.48k
    if (BIT_WIDTH == 0) return 0;
178
179
1.48k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.48k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.48k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.48k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.48k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.48k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.48k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.48k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.48k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.48k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.48k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.48k
    constexpr bool READ_32_BITS =
203
1.48k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.48k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
1.48k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
1.48k
    word >>= FIRST_BIT_OFFSET;
213
214
1.48k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
1.48k
    return word & mask;
221
1.48k
}
_ZN5doris11UnpackValueILi14ELi6ELb1EEEmPKh
Line
Count
Source
176
1.48k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.48k
    if (BIT_WIDTH == 0) return 0;
178
179
1.48k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.48k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.48k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.48k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.48k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.48k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.48k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.48k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.48k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.48k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.48k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.48k
    constexpr bool READ_32_BITS =
203
1.48k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.48k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
1.48k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
1.48k
    word >>= FIRST_BIT_OFFSET;
213
214
1.48k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
1.48k
    return word & mask;
221
1.48k
}
_ZN5doris11UnpackValueILi14ELi7ELb1EEEmPKh
Line
Count
Source
176
1.48k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.48k
    if (BIT_WIDTH == 0) return 0;
178
179
1.48k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.48k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.48k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.48k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.48k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.48k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.48k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.48k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.48k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.48k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.48k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.48k
    constexpr bool READ_32_BITS =
203
1.48k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.48k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
1.48k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
1.48k
    word >>= FIRST_BIT_OFFSET;
213
214
1.48k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
1.48k
    return word & mask;
221
1.48k
}
_ZN5doris11UnpackValueILi14ELi8ELb1EEEmPKh
Line
Count
Source
176
1.48k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.48k
    if (BIT_WIDTH == 0) return 0;
178
179
1.48k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.48k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.48k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.48k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.48k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.48k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.48k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.48k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.48k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.48k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.48k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.48k
    constexpr bool READ_32_BITS =
203
1.48k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.48k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
1.48k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
1.48k
    word >>= FIRST_BIT_OFFSET;
213
214
1.48k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
1.48k
    return word & mask;
221
1.48k
}
_ZN5doris11UnpackValueILi14ELi9ELb1EEEmPKh
Line
Count
Source
176
1.48k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.48k
    if (BIT_WIDTH == 0) return 0;
178
179
1.48k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.48k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.48k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.48k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.48k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.48k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.48k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.48k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.48k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.48k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.48k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.48k
    constexpr bool READ_32_BITS =
203
1.48k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.48k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
1.48k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
1.48k
    word >>= FIRST_BIT_OFFSET;
213
214
1.48k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
1.48k
    return word & mask;
221
1.48k
}
_ZN5doris11UnpackValueILi14ELi10ELb1EEEmPKh
Line
Count
Source
176
1.48k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.48k
    if (BIT_WIDTH == 0) return 0;
178
179
1.48k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.48k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.48k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.48k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.48k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.48k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.48k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.48k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.48k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.48k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.48k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.48k
    constexpr bool READ_32_BITS =
203
1.48k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.48k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
1.48k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
1.48k
    word >>= FIRST_BIT_OFFSET;
213
214
1.48k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
1.48k
    return word & mask;
221
1.48k
}
_ZN5doris11UnpackValueILi14ELi11ELb1EEEmPKh
Line
Count
Source
176
1.48k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.48k
    if (BIT_WIDTH == 0) return 0;
178
179
1.48k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.48k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.48k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.48k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.48k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.48k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.48k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.48k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.48k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.48k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.48k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.48k
    constexpr bool READ_32_BITS =
203
1.48k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.48k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
1.48k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
1.48k
    word >>= FIRST_BIT_OFFSET;
213
214
1.48k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
1.48k
    return word & mask;
221
1.48k
}
_ZN5doris11UnpackValueILi14ELi12ELb1EEEmPKh
Line
Count
Source
176
1.48k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.48k
    if (BIT_WIDTH == 0) return 0;
178
179
1.48k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.48k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.48k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.48k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.48k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.48k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.48k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.48k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.48k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.48k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.48k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.48k
    constexpr bool READ_32_BITS =
203
1.48k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.48k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
1.48k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
1.48k
    word >>= FIRST_BIT_OFFSET;
213
214
1.48k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
1.48k
    return word & mask;
221
1.48k
}
_ZN5doris11UnpackValueILi14ELi13ELb1EEEmPKh
Line
Count
Source
176
1.48k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.48k
    if (BIT_WIDTH == 0) return 0;
178
179
1.48k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.48k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.48k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.48k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.48k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.48k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.48k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.48k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.48k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.48k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.48k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.48k
    constexpr bool READ_32_BITS =
203
1.48k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.48k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
1.48k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
1.48k
    word >>= FIRST_BIT_OFFSET;
213
214
1.48k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
1.48k
    return word & mask;
221
1.48k
}
_ZN5doris11UnpackValueILi14ELi14ELb1EEEmPKh
Line
Count
Source
176
1.48k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.48k
    if (BIT_WIDTH == 0) return 0;
178
179
1.48k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.48k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.48k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.48k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.48k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.48k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.48k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.48k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.48k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.48k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.48k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.48k
    constexpr bool READ_32_BITS =
203
1.48k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.48k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
1.48k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
1.48k
    word >>= FIRST_BIT_OFFSET;
213
214
1.48k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
1.48k
    return word & mask;
221
1.48k
}
_ZN5doris11UnpackValueILi14ELi15ELb1EEEmPKh
Line
Count
Source
176
1.48k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.48k
    if (BIT_WIDTH == 0) return 0;
178
179
1.48k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.48k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.48k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.48k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.48k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.48k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.48k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.48k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.48k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.48k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.48k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.48k
    constexpr bool READ_32_BITS =
203
1.48k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.48k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
1.48k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
1.48k
    word >>= FIRST_BIT_OFFSET;
213
214
1.48k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
1.48k
    return word & mask;
221
1.48k
}
_ZN5doris11UnpackValueILi14ELi16ELb1EEEmPKh
Line
Count
Source
176
1.48k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.48k
    if (BIT_WIDTH == 0) return 0;
178
179
1.48k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.48k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.48k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.48k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.48k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.48k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.48k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.48k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.48k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.48k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.48k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.48k
    constexpr bool READ_32_BITS =
203
1.48k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.48k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
1.48k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
1.48k
    word >>= FIRST_BIT_OFFSET;
213
214
1.48k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
1.48k
    return word & mask;
221
1.48k
}
_ZN5doris11UnpackValueILi14ELi17ELb1EEEmPKh
Line
Count
Source
176
1.48k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.48k
    if (BIT_WIDTH == 0) return 0;
178
179
1.48k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.48k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.48k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.48k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.48k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.48k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.48k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.48k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.48k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.48k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.48k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.48k
    constexpr bool READ_32_BITS =
203
1.48k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.48k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
1.48k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
1.48k
    word >>= FIRST_BIT_OFFSET;
213
214
1.48k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
1.48k
    return word & mask;
221
1.48k
}
_ZN5doris11UnpackValueILi14ELi18ELb1EEEmPKh
Line
Count
Source
176
1.48k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.48k
    if (BIT_WIDTH == 0) return 0;
178
179
1.48k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.48k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.48k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.48k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.48k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.48k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.48k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.48k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.48k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.48k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.48k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.48k
    constexpr bool READ_32_BITS =
203
1.48k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.48k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
1.48k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
1.48k
    word >>= FIRST_BIT_OFFSET;
213
214
1.48k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
1.48k
    return word & mask;
221
1.48k
}
_ZN5doris11UnpackValueILi14ELi19ELb1EEEmPKh
Line
Count
Source
176
1.48k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.48k
    if (BIT_WIDTH == 0) return 0;
178
179
1.48k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.48k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.48k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.48k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.48k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.48k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.48k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.48k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.48k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.48k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.48k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.48k
    constexpr bool READ_32_BITS =
203
1.48k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.48k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
1.48k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
1.48k
    word >>= FIRST_BIT_OFFSET;
213
214
1.48k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
1.48k
    return word & mask;
221
1.48k
}
_ZN5doris11UnpackValueILi14ELi20ELb1EEEmPKh
Line
Count
Source
176
1.48k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.48k
    if (BIT_WIDTH == 0) return 0;
178
179
1.48k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.48k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.48k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.48k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.48k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.48k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.48k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.48k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.48k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.48k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.48k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.48k
    constexpr bool READ_32_BITS =
203
1.48k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.48k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
1.48k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
1.48k
    word >>= FIRST_BIT_OFFSET;
213
214
1.48k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
1.48k
    return word & mask;
221
1.48k
}
_ZN5doris11UnpackValueILi14ELi21ELb1EEEmPKh
Line
Count
Source
176
1.48k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.48k
    if (BIT_WIDTH == 0) return 0;
178
179
1.48k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.48k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.48k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.48k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.48k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.48k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.48k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.48k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.48k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.48k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.48k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.48k
    constexpr bool READ_32_BITS =
203
1.48k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.48k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
1.48k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
1.48k
    word >>= FIRST_BIT_OFFSET;
213
214
1.48k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
1.48k
    return word & mask;
221
1.48k
}
_ZN5doris11UnpackValueILi14ELi22ELb1EEEmPKh
Line
Count
Source
176
1.48k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.48k
    if (BIT_WIDTH == 0) return 0;
178
179
1.48k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.48k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.48k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.48k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.48k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.48k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.48k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.48k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.48k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.48k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.48k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.48k
    constexpr bool READ_32_BITS =
203
1.48k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.48k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
1.48k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
1.48k
    word >>= FIRST_BIT_OFFSET;
213
214
1.48k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
1.48k
    return word & mask;
221
1.48k
}
_ZN5doris11UnpackValueILi14ELi23ELb1EEEmPKh
Line
Count
Source
176
1.48k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.48k
    if (BIT_WIDTH == 0) return 0;
178
179
1.48k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.48k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.48k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.48k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.48k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.48k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.48k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.48k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.48k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.48k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.48k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.48k
    constexpr bool READ_32_BITS =
203
1.48k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.48k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
1.48k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
1.48k
    word >>= FIRST_BIT_OFFSET;
213
214
1.48k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
1.48k
    return word & mask;
221
1.48k
}
_ZN5doris11UnpackValueILi14ELi24ELb1EEEmPKh
Line
Count
Source
176
1.48k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.48k
    if (BIT_WIDTH == 0) return 0;
178
179
1.48k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.48k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.48k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.48k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.48k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.48k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.48k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.48k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.48k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.48k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.48k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.48k
    constexpr bool READ_32_BITS =
203
1.48k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.48k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
1.48k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
1.48k
    word >>= FIRST_BIT_OFFSET;
213
214
1.48k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
1.48k
    return word & mask;
221
1.48k
}
_ZN5doris11UnpackValueILi14ELi25ELb1EEEmPKh
Line
Count
Source
176
1.48k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.48k
    if (BIT_WIDTH == 0) return 0;
178
179
1.48k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.48k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.48k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.48k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.48k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.48k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.48k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.48k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.48k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.48k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.48k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.48k
    constexpr bool READ_32_BITS =
203
1.48k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.48k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
1.48k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
1.48k
    word >>= FIRST_BIT_OFFSET;
213
214
1.48k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
1.48k
    return word & mask;
221
1.48k
}
_ZN5doris11UnpackValueILi14ELi26ELb1EEEmPKh
Line
Count
Source
176
1.48k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.48k
    if (BIT_WIDTH == 0) return 0;
178
179
1.48k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.48k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.48k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.48k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.48k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.48k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.48k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.48k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.48k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.48k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.48k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.48k
    constexpr bool READ_32_BITS =
203
1.48k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.48k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
1.48k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
1.48k
    word >>= FIRST_BIT_OFFSET;
213
214
1.48k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
1.48k
    return word & mask;
221
1.48k
}
_ZN5doris11UnpackValueILi14ELi27ELb1EEEmPKh
Line
Count
Source
176
1.48k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.48k
    if (BIT_WIDTH == 0) return 0;
178
179
1.48k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.48k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.48k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.48k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.48k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.48k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.48k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.48k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.48k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.48k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.48k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.48k
    constexpr bool READ_32_BITS =
203
1.48k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.48k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
1.48k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
1.48k
    word >>= FIRST_BIT_OFFSET;
213
214
1.48k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
1.48k
    return word & mask;
221
1.48k
}
_ZN5doris11UnpackValueILi14ELi28ELb1EEEmPKh
Line
Count
Source
176
1.48k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.48k
    if (BIT_WIDTH == 0) return 0;
178
179
1.48k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.48k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.48k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.48k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.48k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.48k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.48k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.48k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.48k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.48k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.48k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.48k
    constexpr bool READ_32_BITS =
203
1.48k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.48k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
1.48k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
1.48k
    word >>= FIRST_BIT_OFFSET;
213
214
1.48k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
1.48k
    return word & mask;
221
1.48k
}
_ZN5doris11UnpackValueILi14ELi29ELb1EEEmPKh
Line
Count
Source
176
1.48k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.48k
    if (BIT_WIDTH == 0) return 0;
178
179
1.48k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.48k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.48k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.48k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.48k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.48k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.48k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.48k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.48k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.48k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.48k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.48k
    constexpr bool READ_32_BITS =
203
1.48k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.48k
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
1.48k
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
1.48k
    word >>= FIRST_BIT_OFFSET;
213
214
1.48k
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
1.48k
    return word & mask;
221
1.48k
}
_ZN5doris11UnpackValueILi14ELi30ELb1EEEmPKh
Line
Count
Source
176
1.48k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.48k
    if (BIT_WIDTH == 0) return 0;
178
179
1.48k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.48k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.48k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.48k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.48k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.48k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.48k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.48k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.48k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.48k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.48k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.48k
    constexpr bool READ_32_BITS =
203
1.48k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.48k
    if (READ_32_BITS) {
206
1.48k
        uint32_t word = in[FIRST_WORD_IDX];
207
1.48k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
1.48k
        return word & mask;
209
1.48k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
1.48k
}
_ZN5doris11UnpackValueILi14ELi31ELb1EEEmPKh
Line
Count
Source
176
1.48k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
1.48k
    if (BIT_WIDTH == 0) return 0;
178
179
1.48k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
1.48k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
1.48k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
1.48k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
1.48k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
1.48k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
1.48k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
1.48k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
1.48k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
1.48k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
1.48k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
1.48k
    constexpr bool READ_32_BITS =
203
1.48k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
1.48k
    if (READ_32_BITS) {
206
1.48k
        uint32_t word = in[FIRST_WORD_IDX];
207
1.48k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
1.48k
        return word & mask;
209
1.48k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
1.48k
}
Unexecuted instantiation: _ZN5doris11UnpackValueILi14ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi14ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi14ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi14ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi14ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi14ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi14ELi24ELb0EEEmPKh
_ZN5doris11UnpackValueILi14ELi23ELb0EEEmPKh
Line
Count
Source
176
97
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
97
    if (BIT_WIDTH == 0) return 0;
178
179
97
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
97
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
97
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
97
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
97
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
97
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
97
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
97
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
97
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
97
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
97
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
97
    constexpr bool READ_32_BITS =
203
97
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
97
    if (READ_32_BITS) {
206
97
        uint32_t word = in[FIRST_WORD_IDX];
207
97
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
97
        return word & mask;
209
97
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
97
}
_ZN5doris11UnpackValueILi14ELi22ELb0EEEmPKh
Line
Count
Source
176
97
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
97
    if (BIT_WIDTH == 0) return 0;
178
179
97
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
97
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
97
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
97
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
97
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
97
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
97
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
97
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
97
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
97
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
97
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
97
    constexpr bool READ_32_BITS =
203
97
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
97
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
97
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
97
    word >>= FIRST_BIT_OFFSET;
213
214
97
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
97
    return word & mask;
221
97
}
_ZN5doris11UnpackValueILi14ELi21ELb0EEEmPKh
Line
Count
Source
176
97
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
97
    if (BIT_WIDTH == 0) return 0;
178
179
97
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
97
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
97
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
97
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
97
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
97
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
97
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
97
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
97
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
97
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
97
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
97
    constexpr bool READ_32_BITS =
203
97
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
97
    if (READ_32_BITS) {
206
97
        uint32_t word = in[FIRST_WORD_IDX];
207
97
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
97
        return word & mask;
209
97
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
97
}
_ZN5doris11UnpackValueILi14ELi20ELb0EEEmPKh
Line
Count
Source
176
97
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
97
    if (BIT_WIDTH == 0) return 0;
178
179
97
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
97
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
97
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
97
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
97
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
97
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
97
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
97
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
97
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
97
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
97
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
97
    constexpr bool READ_32_BITS =
203
97
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
97
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
97
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
97
    word >>= FIRST_BIT_OFFSET;
213
214
97
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
97
    return word & mask;
221
97
}
_ZN5doris11UnpackValueILi14ELi19ELb0EEEmPKh
Line
Count
Source
176
97
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
97
    if (BIT_WIDTH == 0) return 0;
178
179
97
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
97
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
97
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
97
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
97
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
97
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
97
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
97
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
97
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
97
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
97
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
97
    constexpr bool READ_32_BITS =
203
97
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
97
    if (READ_32_BITS) {
206
97
        uint32_t word = in[FIRST_WORD_IDX];
207
97
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
97
        return word & mask;
209
97
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
97
}
_ZN5doris11UnpackValueILi14ELi18ELb0EEEmPKh
Line
Count
Source
176
97
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
97
    if (BIT_WIDTH == 0) return 0;
178
179
97
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
97
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
97
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
97
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
97
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
97
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
97
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
97
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
97
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
97
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
97
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
97
    constexpr bool READ_32_BITS =
203
97
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
97
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
97
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
97
    word >>= FIRST_BIT_OFFSET;
213
214
97
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
97
    return word & mask;
221
97
}
_ZN5doris11UnpackValueILi14ELi17ELb0EEEmPKh
Line
Count
Source
176
97
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
97
    if (BIT_WIDTH == 0) return 0;
178
179
97
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
97
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
97
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
97
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
97
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
97
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
97
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
97
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
97
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
97
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
97
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
97
    constexpr bool READ_32_BITS =
203
97
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
97
    if (READ_32_BITS) {
206
97
        uint32_t word = in[FIRST_WORD_IDX];
207
97
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
97
        return word & mask;
209
97
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
97
}
_ZN5doris11UnpackValueILi14ELi16ELb0EEEmPKh
Line
Count
Source
176
97
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
97
    if (BIT_WIDTH == 0) return 0;
178
179
97
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
97
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
97
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
97
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
97
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
97
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
97
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
97
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
97
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
97
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
97
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
97
    constexpr bool READ_32_BITS =
203
97
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
97
    if (READ_32_BITS) {
206
97
        uint32_t word = in[FIRST_WORD_IDX];
207
97
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
97
        return word & mask;
209
97
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
97
}
_ZN5doris11UnpackValueILi14ELi15ELb0EEEmPKh
Line
Count
Source
176
99
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
99
    if (BIT_WIDTH == 0) return 0;
178
179
99
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
99
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
99
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
99
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
99
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
99
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
99
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
99
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
99
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
99
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
99
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
99
    constexpr bool READ_32_BITS =
203
99
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
99
    if (READ_32_BITS) {
206
99
        uint32_t word = in[FIRST_WORD_IDX];
207
99
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
99
        return word & mask;
209
99
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
99
}
_ZN5doris11UnpackValueILi14ELi14ELb0EEEmPKh
Line
Count
Source
176
99
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
99
    if (BIT_WIDTH == 0) return 0;
178
179
99
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
99
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
99
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
99
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
99
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
99
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
99
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
99
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
99
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
99
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
99
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
99
    constexpr bool READ_32_BITS =
203
99
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
99
    if (READ_32_BITS) {
206
99
        uint32_t word = in[FIRST_WORD_IDX];
207
99
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
99
        return word & mask;
209
99
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
99
}
_ZN5doris11UnpackValueILi14ELi13ELb0EEEmPKh
Line
Count
Source
176
99
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
99
    if (BIT_WIDTH == 0) return 0;
178
179
99
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
99
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
99
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
99
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
99
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
99
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
99
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
99
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
99
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
99
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
99
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
99
    constexpr bool READ_32_BITS =
203
99
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
99
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
99
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
99
    word >>= FIRST_BIT_OFFSET;
213
214
99
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
99
    return word & mask;
221
99
}
_ZN5doris11UnpackValueILi14ELi12ELb0EEEmPKh
Line
Count
Source
176
99
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
99
    if (BIT_WIDTH == 0) return 0;
178
179
99
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
99
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
99
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
99
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
99
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
99
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
99
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
99
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
99
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
99
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
99
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
99
    constexpr bool READ_32_BITS =
203
99
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
99
    if (READ_32_BITS) {
206
99
        uint32_t word = in[FIRST_WORD_IDX];
207
99
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
99
        return word & mask;
209
99
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
99
}
_ZN5doris11UnpackValueILi14ELi11ELb0EEEmPKh
Line
Count
Source
176
99
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
99
    if (BIT_WIDTH == 0) return 0;
178
179
99
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
99
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
99
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
99
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
99
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
99
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
99
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
99
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
99
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
99
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
99
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
99
    constexpr bool READ_32_BITS =
203
99
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
99
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
99
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
99
    word >>= FIRST_BIT_OFFSET;
213
214
99
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
99
    return word & mask;
221
99
}
_ZN5doris11UnpackValueILi14ELi10ELb0EEEmPKh
Line
Count
Source
176
99
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
99
    if (BIT_WIDTH == 0) return 0;
178
179
99
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
99
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
99
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
99
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
99
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
99
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
99
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
99
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
99
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
99
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
99
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
99
    constexpr bool READ_32_BITS =
203
99
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
99
    if (READ_32_BITS) {
206
99
        uint32_t word = in[FIRST_WORD_IDX];
207
99
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
99
        return word & mask;
209
99
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
99
}
_ZN5doris11UnpackValueILi14ELi9ELb0EEEmPKh
Line
Count
Source
176
99
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
99
    if (BIT_WIDTH == 0) return 0;
178
179
99
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
99
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
99
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
99
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
99
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
99
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
99
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
99
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
99
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
99
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
99
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
99
    constexpr bool READ_32_BITS =
203
99
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
99
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
99
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
99
    word >>= FIRST_BIT_OFFSET;
213
214
99
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
99
    return word & mask;
221
99
}
_ZN5doris11UnpackValueILi14ELi8ELb0EEEmPKh
Line
Count
Source
176
99
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
99
    if (BIT_WIDTH == 0) return 0;
178
179
99
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
99
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
99
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
99
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
99
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
99
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
99
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
99
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
99
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
99
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
99
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
99
    constexpr bool READ_32_BITS =
203
99
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
99
    if (READ_32_BITS) {
206
99
        uint32_t word = in[FIRST_WORD_IDX];
207
99
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
99
        return word & mask;
209
99
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
99
}
_ZN5doris11UnpackValueILi14ELi7ELb0EEEmPKh
Line
Count
Source
176
99
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
99
    if (BIT_WIDTH == 0) return 0;
178
179
99
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
99
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
99
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
99
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
99
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
99
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
99
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
99
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
99
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
99
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
99
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
99
    constexpr bool READ_32_BITS =
203
99
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
99
    if (READ_32_BITS) {
206
99
        uint32_t word = in[FIRST_WORD_IDX];
207
99
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
99
        return word & mask;
209
99
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
99
}
_ZN5doris11UnpackValueILi14ELi6ELb0EEEmPKh
Line
Count
Source
176
99
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
99
    if (BIT_WIDTH == 0) return 0;
178
179
99
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
99
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
99
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
99
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
99
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
99
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
99
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
99
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
99
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
99
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
99
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
99
    constexpr bool READ_32_BITS =
203
99
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
99
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
99
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
99
    word >>= FIRST_BIT_OFFSET;
213
214
99
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
99
    return word & mask;
221
99
}
_ZN5doris11UnpackValueILi14ELi5ELb0EEEmPKh
Line
Count
Source
176
99
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
99
    if (BIT_WIDTH == 0) return 0;
178
179
99
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
99
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
99
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
99
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
99
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
99
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
99
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
99
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
99
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
99
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
99
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
99
    constexpr bool READ_32_BITS =
203
99
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
99
    if (READ_32_BITS) {
206
99
        uint32_t word = in[FIRST_WORD_IDX];
207
99
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
99
        return word & mask;
209
99
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
99
}
_ZN5doris11UnpackValueILi14ELi4ELb0EEEmPKh
Line
Count
Source
176
99
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
99
    if (BIT_WIDTH == 0) return 0;
178
179
99
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
99
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
99
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
99
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
99
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
99
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
99
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
99
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
99
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
99
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
99
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
99
    constexpr bool READ_32_BITS =
203
99
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
99
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
99
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
99
    word >>= FIRST_BIT_OFFSET;
213
214
99
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
99
    return word & mask;
221
99
}
_ZN5doris11UnpackValueILi14ELi3ELb0EEEmPKh
Line
Count
Source
176
99
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
99
    if (BIT_WIDTH == 0) return 0;
178
179
99
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
99
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
99
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
99
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
99
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
99
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
99
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
99
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
99
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
99
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
99
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
99
    constexpr bool READ_32_BITS =
203
99
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
99
    if (READ_32_BITS) {
206
99
        uint32_t word = in[FIRST_WORD_IDX];
207
99
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
99
        return word & mask;
209
99
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
99
}
_ZN5doris11UnpackValueILi14ELi2ELb0EEEmPKh
Line
Count
Source
176
99
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
99
    if (BIT_WIDTH == 0) return 0;
178
179
99
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
99
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
99
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
99
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
99
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
99
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
99
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
99
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
99
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
99
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
99
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
99
    constexpr bool READ_32_BITS =
203
99
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
99
    if (READ_32_BITS) {
206
0
        uint32_t word = in[FIRST_WORD_IDX];
207
0
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
0
        return word & mask;
209
0
    }
210
211
99
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
99
    word >>= FIRST_BIT_OFFSET;
213
214
99
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
99
    return word & mask;
221
99
}
_ZN5doris11UnpackValueILi14ELi1ELb0EEEmPKh
Line
Count
Source
176
99
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
99
    if (BIT_WIDTH == 0) return 0;
178
179
99
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
99
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
99
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
99
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
99
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
99
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
99
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
99
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
99
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
99
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
99
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
99
    constexpr bool READ_32_BITS =
203
99
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
99
    if (READ_32_BITS) {
206
99
        uint32_t word = in[FIRST_WORD_IDX];
207
99
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
99
        return word & mask;
209
99
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
99
}
_ZN5doris11UnpackValueILi14ELi0ELb0EEEmPKh
Line
Count
Source
176
99
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
99
    if (BIT_WIDTH == 0) return 0;
178
179
99
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
99
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
99
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
99
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
99
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
99
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
99
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
99
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
99
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
99
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
99
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
99
    constexpr bool READ_32_BITS =
203
99
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
99
    if (READ_32_BITS) {
206
99
        uint32_t word = in[FIRST_WORD_IDX];
207
99
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
99
        return word & mask;
209
99
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
99
}
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi15ELi0ELb0EEEmPKh
_ZN5doris11UnpackValueILi16ELi0ELb1EEEmPKh
Line
Count
Source
176
4.18k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.18k
    if (BIT_WIDTH == 0) return 0;
178
179
4.18k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.18k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.18k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.18k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.18k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.18k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.18k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.18k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.18k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.18k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.18k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.18k
    constexpr bool READ_32_BITS =
203
4.18k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.18k
    if (READ_32_BITS) {
206
4.18k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.18k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.18k
        return word & mask;
209
4.18k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.18k
}
_ZN5doris11UnpackValueILi16ELi1ELb1EEEmPKh
Line
Count
Source
176
4.18k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.18k
    if (BIT_WIDTH == 0) return 0;
178
179
4.18k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.18k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.18k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.18k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.18k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.18k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.18k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.18k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.18k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.18k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.18k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.18k
    constexpr bool READ_32_BITS =
203
4.18k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.18k
    if (READ_32_BITS) {
206
4.18k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.18k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.18k
        return word & mask;
209
4.18k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.18k
}
_ZN5doris11UnpackValueILi16ELi2ELb1EEEmPKh
Line
Count
Source
176
4.18k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.18k
    if (BIT_WIDTH == 0) return 0;
178
179
4.18k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.18k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.18k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.18k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.18k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.18k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.18k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.18k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.18k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.18k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.18k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.18k
    constexpr bool READ_32_BITS =
203
4.18k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.18k
    if (READ_32_BITS) {
206
4.18k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.18k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.18k
        return word & mask;
209
4.18k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.18k
}
_ZN5doris11UnpackValueILi16ELi3ELb1EEEmPKh
Line
Count
Source
176
4.18k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.18k
    if (BIT_WIDTH == 0) return 0;
178
179
4.18k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.18k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.18k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.18k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.18k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.18k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.18k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.18k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.18k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.18k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.18k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.18k
    constexpr bool READ_32_BITS =
203
4.18k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.18k
    if (READ_32_BITS) {
206
4.18k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.18k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.18k
        return word & mask;
209
4.18k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.18k
}
_ZN5doris11UnpackValueILi16ELi4ELb1EEEmPKh
Line
Count
Source
176
4.18k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.18k
    if (BIT_WIDTH == 0) return 0;
178
179
4.18k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.18k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.18k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.18k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.18k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.18k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.18k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.18k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.18k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.18k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.18k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.18k
    constexpr bool READ_32_BITS =
203
4.18k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.18k
    if (READ_32_BITS) {
206
4.18k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.18k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.18k
        return word & mask;
209
4.18k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.18k
}
_ZN5doris11UnpackValueILi16ELi5ELb1EEEmPKh
Line
Count
Source
176
4.18k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.18k
    if (BIT_WIDTH == 0) return 0;
178
179
4.18k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.18k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.18k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.18k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.18k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.18k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.18k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.18k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.18k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.18k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.18k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.18k
    constexpr bool READ_32_BITS =
203
4.18k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.18k
    if (READ_32_BITS) {
206
4.18k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.18k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.18k
        return word & mask;
209
4.18k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.18k
}
_ZN5doris11UnpackValueILi16ELi6ELb1EEEmPKh
Line
Count
Source
176
4.18k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.18k
    if (BIT_WIDTH == 0) return 0;
178
179
4.18k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.18k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.18k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.18k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.18k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.18k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.18k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.18k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.18k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.18k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.18k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.18k
    constexpr bool READ_32_BITS =
203
4.18k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.18k
    if (READ_32_BITS) {
206
4.18k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.18k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.18k
        return word & mask;
209
4.18k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.18k
}
_ZN5doris11UnpackValueILi16ELi7ELb1EEEmPKh
Line
Count
Source
176
4.18k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.18k
    if (BIT_WIDTH == 0) return 0;
178
179
4.18k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.18k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.18k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.18k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.18k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.18k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.18k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.18k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.18k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.18k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.18k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.18k
    constexpr bool READ_32_BITS =
203
4.18k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.18k
    if (READ_32_BITS) {
206
4.18k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.18k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.18k
        return word & mask;
209
4.18k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.18k
}
_ZN5doris11UnpackValueILi16ELi8ELb1EEEmPKh
Line
Count
Source
176
4.18k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.18k
    if (BIT_WIDTH == 0) return 0;
178
179
4.18k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.18k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.18k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.18k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.18k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.18k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.18k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.18k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.18k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.18k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.18k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.18k
    constexpr bool READ_32_BITS =
203
4.18k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.18k
    if (READ_32_BITS) {
206
4.18k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.18k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.18k
        return word & mask;
209
4.18k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.18k
}
_ZN5doris11UnpackValueILi16ELi9ELb1EEEmPKh
Line
Count
Source
176
4.18k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.18k
    if (BIT_WIDTH == 0) return 0;
178
179
4.18k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.18k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.18k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.18k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.18k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.18k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.18k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.18k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.18k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.18k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.18k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.18k
    constexpr bool READ_32_BITS =
203
4.18k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.18k
    if (READ_32_BITS) {
206
4.18k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.18k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.18k
        return word & mask;
209
4.18k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.18k
}
_ZN5doris11UnpackValueILi16ELi10ELb1EEEmPKh
Line
Count
Source
176
4.18k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.18k
    if (BIT_WIDTH == 0) return 0;
178
179
4.18k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.18k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.18k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.18k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.18k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.18k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.18k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.18k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.18k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.18k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.18k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.18k
    constexpr bool READ_32_BITS =
203
4.18k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.18k
    if (READ_32_BITS) {
206
4.18k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.18k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.18k
        return word & mask;
209
4.18k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.18k
}
_ZN5doris11UnpackValueILi16ELi11ELb1EEEmPKh
Line
Count
Source
176
4.18k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.18k
    if (BIT_WIDTH == 0) return 0;
178
179
4.18k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.18k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.18k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.18k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.18k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.18k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.18k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.18k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.18k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.18k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.18k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.18k
    constexpr bool READ_32_BITS =
203
4.18k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.18k
    if (READ_32_BITS) {
206
4.18k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.18k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.18k
        return word & mask;
209
4.18k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.18k
}
_ZN5doris11UnpackValueILi16ELi12ELb1EEEmPKh
Line
Count
Source
176
4.18k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.18k
    if (BIT_WIDTH == 0) return 0;
178
179
4.18k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.18k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.18k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.18k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.18k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.18k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.18k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.18k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.18k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.18k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.18k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.18k
    constexpr bool READ_32_BITS =
203
4.18k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.18k
    if (READ_32_BITS) {
206
4.18k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.18k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.18k
        return word & mask;
209
4.18k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.18k
}
_ZN5doris11UnpackValueILi16ELi13ELb1EEEmPKh
Line
Count
Source
176
4.18k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.18k
    if (BIT_WIDTH == 0) return 0;
178
179
4.18k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.18k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.18k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.18k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.18k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.18k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.18k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.18k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.18k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.18k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.18k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.18k
    constexpr bool READ_32_BITS =
203
4.18k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.18k
    if (READ_32_BITS) {
206
4.18k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.18k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.18k
        return word & mask;
209
4.18k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.18k
}
_ZN5doris11UnpackValueILi16ELi14ELb1EEEmPKh
Line
Count
Source
176
4.18k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.18k
    if (BIT_WIDTH == 0) return 0;
178
179
4.18k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.18k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.18k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.18k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.18k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.18k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.18k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.18k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.18k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.18k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.18k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.18k
    constexpr bool READ_32_BITS =
203
4.18k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.18k
    if (READ_32_BITS) {
206
4.18k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.18k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.18k
        return word & mask;
209
4.18k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.18k
}
_ZN5doris11UnpackValueILi16ELi15ELb1EEEmPKh
Line
Count
Source
176
4.18k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.18k
    if (BIT_WIDTH == 0) return 0;
178
179
4.18k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.18k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.18k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.18k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.18k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.18k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.18k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.18k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.18k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.18k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.18k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.18k
    constexpr bool READ_32_BITS =
203
4.18k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.18k
    if (READ_32_BITS) {
206
4.18k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.18k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.18k
        return word & mask;
209
4.18k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.18k
}
_ZN5doris11UnpackValueILi16ELi16ELb1EEEmPKh
Line
Count
Source
176
4.18k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.18k
    if (BIT_WIDTH == 0) return 0;
178
179
4.18k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.18k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.18k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.18k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.18k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.18k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.18k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.18k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.18k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.18k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.18k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.18k
    constexpr bool READ_32_BITS =
203
4.18k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.18k
    if (READ_32_BITS) {
206
4.18k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.18k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.18k
        return word & mask;
209
4.18k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.18k
}
_ZN5doris11UnpackValueILi16ELi17ELb1EEEmPKh
Line
Count
Source
176
4.18k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.18k
    if (BIT_WIDTH == 0) return 0;
178
179
4.18k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.18k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.18k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.18k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.18k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.18k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.18k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.18k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.18k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.18k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.18k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.18k
    constexpr bool READ_32_BITS =
203
4.18k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.18k
    if (READ_32_BITS) {
206
4.18k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.18k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.18k
        return word & mask;
209
4.18k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.18k
}
_ZN5doris11UnpackValueILi16ELi18ELb1EEEmPKh
Line
Count
Source
176
4.18k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.18k
    if (BIT_WIDTH == 0) return 0;
178
179
4.18k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.18k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.18k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.18k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.18k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.18k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.18k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.18k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.18k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.18k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.18k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.18k
    constexpr bool READ_32_BITS =
203
4.18k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.18k
    if (READ_32_BITS) {
206
4.18k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.18k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.18k
        return word & mask;
209
4.18k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.18k
}
_ZN5doris11UnpackValueILi16ELi19ELb1EEEmPKh
Line
Count
Source
176
4.18k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.18k
    if (BIT_WIDTH == 0) return 0;
178
179
4.18k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.18k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.18k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.18k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.18k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.18k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.18k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.18k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.18k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.18k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.18k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.18k
    constexpr bool READ_32_BITS =
203
4.18k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.18k
    if (READ_32_BITS) {
206
4.18k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.18k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.18k
        return word & mask;
209
4.18k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.18k
}
_ZN5doris11UnpackValueILi16ELi20ELb1EEEmPKh
Line
Count
Source
176
4.18k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.18k
    if (BIT_WIDTH == 0) return 0;
178
179
4.18k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.18k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.18k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.18k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.18k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.18k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.18k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.18k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.18k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.18k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.18k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.18k
    constexpr bool READ_32_BITS =
203
4.18k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.18k
    if (READ_32_BITS) {
206
4.18k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.18k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.18k
        return word & mask;
209
4.18k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.18k
}
_ZN5doris11UnpackValueILi16ELi21ELb1EEEmPKh
Line
Count
Source
176
4.18k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.18k
    if (BIT_WIDTH == 0) return 0;
178
179
4.18k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.18k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.18k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.18k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.18k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.18k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.18k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.18k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.18k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.18k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.18k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.18k
    constexpr bool READ_32_BITS =
203
4.18k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.18k
    if (READ_32_BITS) {
206
4.18k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.18k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.18k
        return word & mask;
209
4.18k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.18k
}
_ZN5doris11UnpackValueILi16ELi22ELb1EEEmPKh
Line
Count
Source
176
4.18k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.18k
    if (BIT_WIDTH == 0) return 0;
178
179
4.18k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.18k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.18k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.18k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.18k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.18k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.18k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.18k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.18k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.18k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.18k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.18k
    constexpr bool READ_32_BITS =
203
4.18k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.18k
    if (READ_32_BITS) {
206
4.18k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.18k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.18k
        return word & mask;
209
4.18k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.18k
}
_ZN5doris11UnpackValueILi16ELi23ELb1EEEmPKh
Line
Count
Source
176
4.18k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.18k
    if (BIT_WIDTH == 0) return 0;
178
179
4.18k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.18k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.18k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.18k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.18k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.18k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.18k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.18k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.18k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.18k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.18k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.18k
    constexpr bool READ_32_BITS =
203
4.18k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.18k
    if (READ_32_BITS) {
206
4.18k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.18k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.18k
        return word & mask;
209
4.18k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.18k
}
_ZN5doris11UnpackValueILi16ELi24ELb1EEEmPKh
Line
Count
Source
176
4.18k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.18k
    if (BIT_WIDTH == 0) return 0;
178
179
4.18k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.18k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.18k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.18k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.18k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.18k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.18k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.18k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.18k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.18k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.18k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.18k
    constexpr bool READ_32_BITS =
203
4.18k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.18k
    if (READ_32_BITS) {
206
4.18k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.18k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.18k
        return word & mask;
209
4.18k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.18k
}
_ZN5doris11UnpackValueILi16ELi25ELb1EEEmPKh
Line
Count
Source
176
4.18k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.18k
    if (BIT_WIDTH == 0) return 0;
178
179
4.18k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.18k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.18k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.18k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.18k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.18k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.18k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.18k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.18k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.18k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.18k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.18k
    constexpr bool READ_32_BITS =
203
4.18k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.18k
    if (READ_32_BITS) {
206
4.18k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.18k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.18k
        return word & mask;
209
4.18k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.18k
}
_ZN5doris11UnpackValueILi16ELi26ELb1EEEmPKh
Line
Count
Source
176
4.18k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.18k
    if (BIT_WIDTH == 0) return 0;
178
179
4.18k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.18k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.18k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.18k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.18k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.18k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.18k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.18k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.18k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.18k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.18k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.18k
    constexpr bool READ_32_BITS =
203
4.18k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.18k
    if (READ_32_BITS) {
206
4.18k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.18k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.18k
        return word & mask;
209
4.18k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.18k
}
_ZN5doris11UnpackValueILi16ELi27ELb1EEEmPKh
Line
Count
Source
176
4.18k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.18k
    if (BIT_WIDTH == 0) return 0;
178
179
4.18k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.18k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.18k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.18k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.18k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.18k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.18k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.18k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.18k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.18k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.18k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.18k
    constexpr bool READ_32_BITS =
203
4.18k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.18k
    if (READ_32_BITS) {
206
4.18k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.18k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.18k
        return word & mask;
209
4.18k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.18k
}
_ZN5doris11UnpackValueILi16ELi28ELb1EEEmPKh
Line
Count
Source
176
4.18k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.18k
    if (BIT_WIDTH == 0) return 0;
178
179
4.18k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.18k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.18k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.18k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.18k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.18k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.18k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.18k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.18k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.18k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.18k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.18k
    constexpr bool READ_32_BITS =
203
4.18k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.18k
    if (READ_32_BITS) {
206
4.18k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.18k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.18k
        return word & mask;
209
4.18k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.18k
}
_ZN5doris11UnpackValueILi16ELi29ELb1EEEmPKh
Line
Count
Source
176
4.18k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.18k
    if (BIT_WIDTH == 0) return 0;
178
179
4.18k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.18k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.18k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.18k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.18k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.18k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.18k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.18k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.18k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.18k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.18k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.18k
    constexpr bool READ_32_BITS =
203
4.18k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.18k
    if (READ_32_BITS) {
206
4.18k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.18k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.18k
        return word & mask;
209
4.18k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.18k
}
_ZN5doris11UnpackValueILi16ELi30ELb1EEEmPKh
Line
Count
Source
176
4.18k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.18k
    if (BIT_WIDTH == 0) return 0;
178
179
4.18k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.18k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.18k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.18k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.18k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.18k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.18k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.18k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.18k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.18k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.18k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.18k
    constexpr bool READ_32_BITS =
203
4.18k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.18k
    if (READ_32_BITS) {
206
4.18k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.18k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.18k
        return word & mask;
209
4.18k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.18k
}
_ZN5doris11UnpackValueILi16ELi31ELb1EEEmPKh
Line
Count
Source
176
4.18k
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
4.18k
    if (BIT_WIDTH == 0) return 0;
178
179
4.18k
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
4.18k
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
4.18k
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
4.18k
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
4.18k
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
4.18k
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
4.18k
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
4.18k
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
4.18k
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
4.18k
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
4.18k
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
4.18k
    constexpr bool READ_32_BITS =
203
4.18k
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
4.18k
    if (READ_32_BITS) {
206
4.18k
        uint32_t word = in[FIRST_WORD_IDX];
207
4.18k
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
4.18k
        return word & mask;
209
4.18k
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
4.18k
}
Unexecuted instantiation: _ZN5doris11UnpackValueILi16ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi16ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi16ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi16ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi16ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi16ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi16ELi24ELb0EEEmPKh
_ZN5doris11UnpackValueILi16ELi23ELb0EEEmPKh
Line
Count
Source
176
280
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
280
    if (BIT_WIDTH == 0) return 0;
178
179
280
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
280
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
280
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
280
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
280
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
280
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
280
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
280
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
280
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
280
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
280
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
280
    constexpr bool READ_32_BITS =
203
280
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
280
    if (READ_32_BITS) {
206
280
        uint32_t word = in[FIRST_WORD_IDX];
207
280
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
280
        return word & mask;
209
280
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
280
}
_ZN5doris11UnpackValueILi16ELi22ELb0EEEmPKh
Line
Count
Source
176
280
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
280
    if (BIT_WIDTH == 0) return 0;
178
179
280
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
280
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
280
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
280
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
280
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
280
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
280
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
280
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
280
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
280
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
280
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
280
    constexpr bool READ_32_BITS =
203
280
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
280
    if (READ_32_BITS) {
206
280
        uint32_t word = in[FIRST_WORD_IDX];
207
280
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
280
        return word & mask;
209
280
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
280
}
_ZN5doris11UnpackValueILi16ELi21ELb0EEEmPKh
Line
Count
Source
176
280
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
280
    if (BIT_WIDTH == 0) return 0;
178
179
280
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
280
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
280
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
280
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
280
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
280
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
280
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
280
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
280
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
280
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
280
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
280
    constexpr bool READ_32_BITS =
203
280
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
280
    if (READ_32_BITS) {
206
280
        uint32_t word = in[FIRST_WORD_IDX];
207
280
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
280
        return word & mask;
209
280
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
280
}
_ZN5doris11UnpackValueILi16ELi20ELb0EEEmPKh
Line
Count
Source
176
280
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
280
    if (BIT_WIDTH == 0) return 0;
178
179
280
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
280
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
280
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
280
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
280
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
280
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
280
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
280
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
280
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
280
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
280
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
280
    constexpr bool READ_32_BITS =
203
280
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
280
    if (READ_32_BITS) {
206
280
        uint32_t word = in[FIRST_WORD_IDX];
207
280
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
280
        return word & mask;
209
280
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
280
}
_ZN5doris11UnpackValueILi16ELi19ELb0EEEmPKh
Line
Count
Source
176
280
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
280
    if (BIT_WIDTH == 0) return 0;
178
179
280
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
280
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
280
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
280
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
280
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
280
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
280
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
280
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
280
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
280
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
280
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
280
    constexpr bool READ_32_BITS =
203
280
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
280
    if (READ_32_BITS) {
206
280
        uint32_t word = in[FIRST_WORD_IDX];
207
280
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
280
        return word & mask;
209
280
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
280
}
_ZN5doris11UnpackValueILi16ELi18ELb0EEEmPKh
Line
Count
Source
176
280
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
280
    if (BIT_WIDTH == 0) return 0;
178
179
280
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
280
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
280
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
280
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
280
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
280
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
280
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
280
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
280
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
280
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
280
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
280
    constexpr bool READ_32_BITS =
203
280
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
280
    if (READ_32_BITS) {
206
280
        uint32_t word = in[FIRST_WORD_IDX];
207
280
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
280
        return word & mask;
209
280
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
280
}
_ZN5doris11UnpackValueILi16ELi17ELb0EEEmPKh
Line
Count
Source
176
280
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
280
    if (BIT_WIDTH == 0) return 0;
178
179
280
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
280
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
280
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
280
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
280
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
280
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
280
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
280
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
280
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
280
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
280
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
280
    constexpr bool READ_32_BITS =
203
280
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
280
    if (READ_32_BITS) {
206
280
        uint32_t word = in[FIRST_WORD_IDX];
207
280
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
280
        return word & mask;
209
280
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
280
}
_ZN5doris11UnpackValueILi16ELi16ELb0EEEmPKh
Line
Count
Source
176
280
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
280
    if (BIT_WIDTH == 0) return 0;
178
179
280
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
280
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
280
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
280
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
280
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
280
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
280
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
280
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
280
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
280
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
280
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
280
    constexpr bool READ_32_BITS =
203
280
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
280
    if (READ_32_BITS) {
206
280
        uint32_t word = in[FIRST_WORD_IDX];
207
280
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
280
        return word & mask;
209
280
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
280
}
_ZN5doris11UnpackValueILi16ELi15ELb0EEEmPKh
Line
Count
Source
176
280
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
280
    if (BIT_WIDTH == 0) return 0;
178
179
280
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
280
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
280
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
280
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
280
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
280
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
280
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
280
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
280
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
280
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
280
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
280
    constexpr bool READ_32_BITS =
203
280
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
280
    if (READ_32_BITS) {
206
280
        uint32_t word = in[FIRST_WORD_IDX];
207
280
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
280
        return word & mask;
209
280
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
280
}
_ZN5doris11UnpackValueILi16ELi14ELb0EEEmPKh
Line
Count
Source
176
280
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
280
    if (BIT_WIDTH == 0) return 0;
178
179
280
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
280
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
280
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
280
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
280
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
280
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
280
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
280
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
280
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
280
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
280
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
280
    constexpr bool READ_32_BITS =
203
280
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
280
    if (READ_32_BITS) {
206
280
        uint32_t word = in[FIRST_WORD_IDX];
207
280
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
280
        return word & mask;
209
280
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
280
}
_ZN5doris11UnpackValueILi16ELi13ELb0EEEmPKh
Line
Count
Source
176
280
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
280
    if (BIT_WIDTH == 0) return 0;
178
179
280
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
280
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
280
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
280
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
280
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
280
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
280
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
280
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
280
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
280
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
280
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
280
    constexpr bool READ_32_BITS =
203
280
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
280
    if (READ_32_BITS) {
206
280
        uint32_t word = in[FIRST_WORD_IDX];
207
280
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
280
        return word & mask;
209
280
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
280
}
_ZN5doris11UnpackValueILi16ELi12ELb0EEEmPKh
Line
Count
Source
176
280
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
280
    if (BIT_WIDTH == 0) return 0;
178
179
280
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
280
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
280
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
280
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
280
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
280
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
280
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
280
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
280
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
280
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
280
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
280
    constexpr bool READ_32_BITS =
203
280
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
280
    if (READ_32_BITS) {
206
280
        uint32_t word = in[FIRST_WORD_IDX];
207
280
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
280
        return word & mask;
209
280
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
280
}
_ZN5doris11UnpackValueILi16ELi11ELb0EEEmPKh
Line
Count
Source
176
280
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
280
    if (BIT_WIDTH == 0) return 0;
178
179
280
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
280
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
280
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
280
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
280
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
280
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
280
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
280
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
280
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
280
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
280
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
280
    constexpr bool READ_32_BITS =
203
280
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
280
    if (READ_32_BITS) {
206
280
        uint32_t word = in[FIRST_WORD_IDX];
207
280
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
280
        return word & mask;
209
280
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
280
}
_ZN5doris11UnpackValueILi16ELi10ELb0EEEmPKh
Line
Count
Source
176
280
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
280
    if (BIT_WIDTH == 0) return 0;
178
179
280
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
280
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
280
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
280
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
280
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
280
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
280
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
280
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
280
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
280
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
280
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
280
    constexpr bool READ_32_BITS =
203
280
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
280
    if (READ_32_BITS) {
206
280
        uint32_t word = in[FIRST_WORD_IDX];
207
280
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
280
        return word & mask;
209
280
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
280
}
_ZN5doris11UnpackValueILi16ELi9ELb0EEEmPKh
Line
Count
Source
176
280
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
280
    if (BIT_WIDTH == 0) return 0;
178
179
280
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
280
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
280
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
280
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
280
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
280
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
280
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
280
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
280
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
280
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
280
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
280
    constexpr bool READ_32_BITS =
203
280
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
280
    if (READ_32_BITS) {
206
280
        uint32_t word = in[FIRST_WORD_IDX];
207
280
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
280
        return word & mask;
209
280
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
280
}
_ZN5doris11UnpackValueILi16ELi8ELb0EEEmPKh
Line
Count
Source
176
280
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
280
    if (BIT_WIDTH == 0) return 0;
178
179
280
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
280
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
280
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
280
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
280
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
280
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
280
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
280
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
280
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
280
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
280
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
280
    constexpr bool READ_32_BITS =
203
280
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
280
    if (READ_32_BITS) {
206
280
        uint32_t word = in[FIRST_WORD_IDX];
207
280
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
280
        return word & mask;
209
280
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
280
}
_ZN5doris11UnpackValueILi16ELi7ELb0EEEmPKh
Line
Count
Source
176
280
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
280
    if (BIT_WIDTH == 0) return 0;
178
179
280
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
280
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
280
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
280
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
280
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
280
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
280
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
280
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
280
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
280
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
280
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
280
    constexpr bool READ_32_BITS =
203
280
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
280
    if (READ_32_BITS) {
206
280
        uint32_t word = in[FIRST_WORD_IDX];
207
280
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
280
        return word & mask;
209
280
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
280
}
_ZN5doris11UnpackValueILi16ELi6ELb0EEEmPKh
Line
Count
Source
176
280
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
280
    if (BIT_WIDTH == 0) return 0;
178
179
280
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
280
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
280
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
280
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
280
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
280
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
280
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
280
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
280
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
280
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
280
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
280
    constexpr bool READ_32_BITS =
203
280
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
280
    if (READ_32_BITS) {
206
280
        uint32_t word = in[FIRST_WORD_IDX];
207
280
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
280
        return word & mask;
209
280
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
280
}
_ZN5doris11UnpackValueILi16ELi5ELb0EEEmPKh
Line
Count
Source
176
280
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
280
    if (BIT_WIDTH == 0) return 0;
178
179
280
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
280
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
280
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
280
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
280
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
280
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
280
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
280
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
280
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
280
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
280
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
280
    constexpr bool READ_32_BITS =
203
280
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
280
    if (READ_32_BITS) {
206
280
        uint32_t word = in[FIRST_WORD_IDX];
207
280
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
280
        return word & mask;
209
280
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
280
}
_ZN5doris11UnpackValueILi16ELi4ELb0EEEmPKh
Line
Count
Source
176
280
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
280
    if (BIT_WIDTH == 0) return 0;
178
179
280
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
280
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
280
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
280
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
280
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
280
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
280
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
280
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
280
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
280
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
280
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
280
    constexpr bool READ_32_BITS =
203
280
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
280
    if (READ_32_BITS) {
206
280
        uint32_t word = in[FIRST_WORD_IDX];
207
280
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
280
        return word & mask;
209
280
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
280
}
_ZN5doris11UnpackValueILi16ELi3ELb0EEEmPKh
Line
Count
Source
176
280
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
280
    if (BIT_WIDTH == 0) return 0;
178
179
280
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
280
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
280
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
280
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
280
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
280
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
280
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
280
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
280
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
280
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
280
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
280
    constexpr bool READ_32_BITS =
203
280
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
280
    if (READ_32_BITS) {
206
280
        uint32_t word = in[FIRST_WORD_IDX];
207
280
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
280
        return word & mask;
209
280
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
280
}
_ZN5doris11UnpackValueILi16ELi2ELb0EEEmPKh
Line
Count
Source
176
280
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
280
    if (BIT_WIDTH == 0) return 0;
178
179
280
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
280
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
280
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
280
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
280
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
280
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
280
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
280
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
280
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
280
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
280
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
280
    constexpr bool READ_32_BITS =
203
280
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
280
    if (READ_32_BITS) {
206
280
        uint32_t word = in[FIRST_WORD_IDX];
207
280
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
280
        return word & mask;
209
280
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
280
}
_ZN5doris11UnpackValueILi16ELi1ELb0EEEmPKh
Line
Count
Source
176
280
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
280
    if (BIT_WIDTH == 0) return 0;
178
179
280
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
280
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
280
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
280
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
280
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
280
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
280
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
280
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
280
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
280
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
280
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
280
    constexpr bool READ_32_BITS =
203
280
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
280
    if (READ_32_BITS) {
206
280
        uint32_t word = in[FIRST_WORD_IDX];
207
280
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
280
        return word & mask;
209
280
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
280
}
_ZN5doris11UnpackValueILi16ELi0ELb0EEEmPKh
Line
Count
Source
176
280
uint64_t NO_SANITIZE_UNDEFINED UnpackValue(const uint8_t* __restrict__ in_buf) {
177
280
    if (BIT_WIDTH == 0) return 0;
178
179
280
    constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
180
280
    constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
181
280
    constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
182
280
    constexpr int LAST_WORD_IDX = BitUtil::round_up_numi32(LAST_BIT_IDX);
183
280
    constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
184
280
    static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
185
186
280
    constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
187
280
    constexpr uint64_t mask = GetMask(BIT_WIDTH);
188
280
    const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
189
190
    // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
191
    // this is a full batch read (so the input buffer is 32 * BIT_WIDTH long) and there is
192
    // enough space in the buffer from the current reading point.
193
    // We try to read 64 bits even when it is not necessary because the benchmarks show it
194
    // is faster.
195
280
    constexpr bool CAN_SAFELY_READ_64_BITS =
196
280
            FULL_BATCH && FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
197
198
    // We do not try to read 64 bits when the bit width is a power of two (unless it is
199
    // necessary) because performance benchmarks show that it is better this way. This seems
200
    // to be due to compiler optimisation issues, so we can revisit it when we update the
201
    // compiler version.
202
280
    constexpr bool READ_32_BITS =
203
280
            WORDS_TO_READ == 1 && (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
204
205
280
    if (READ_32_BITS) {
206
280
        uint32_t word = in[FIRST_WORD_IDX];
207
280
        word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
208
280
        return word & mask;
209
280
    }
210
211
0
    uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
212
0
    word >>= FIRST_BIT_OFFSET;
213
214
0
    if (WORDS_TO_READ > 2) {
215
0
        constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
216
0
        uint64_t extra_word = in[FIRST_WORD_IDX + 2];
217
0
        word |= extra_word << USEFUL_BITS;
218
0
    }
219
220
0
    return word & mask;
221
280
}
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi17ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi18ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi19ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi20ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi21ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi22ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi23ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi24ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi25ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi26ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi27ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi28ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi29ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi30ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi31ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi32ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi33ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi34ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi35ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi36ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi37ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi38ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi39ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi40ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi41ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi42ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi43ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi44ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi45ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi46ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi47ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi48ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi49ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi50ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi51ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi52ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi53ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi54ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi55ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi56ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi57ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi58ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi59ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi60ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi61ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi62ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi63ELi0ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi0ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi1ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi2ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi3ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi4ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi5ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi6ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi7ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi8ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi9ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi10ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi11ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi12ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi13ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi14ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi15ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi16ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi17ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi18ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi19ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi20ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi21ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi22ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi23ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi24ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi25ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi26ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi27ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi28ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi29ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi30ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi31ELb1EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi30ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi29ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi28ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi27ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi26ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi25ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi24ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi23ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi22ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi21ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi20ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi19ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi18ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi17ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi16ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi15ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi14ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi13ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi12ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi11ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi10ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi9ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi8ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi7ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi6ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi5ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi4ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi3ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi2ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi1ELb0EEEmPKh
Unexecuted instantiation: _ZN5doris11UnpackValueILi64ELi0ELb0EEEmPKh
222
223
template <typename OutType>
224
void DecodeValue(OutType* __restrict__ dict, int64_t dict_len, uint32_t idx,
225
                 OutType* __restrict__ out_val, bool* __restrict__ decode_error) {
226
    if (UNLIKELY(idx >= dict_len)) {
227
        *decode_error = true;
228
    } else {
229
        // Use memcpy() because we can't assume sufficient alignment in some cases (e.g.
230
        // 16 byte decimals).
231
        memcpy(out_val, &dict[idx], sizeof(OutType));
232
    }
233
}
234
235
template <typename OutType, int BIT_WIDTH>
236
const uint8_t* BitPacking::Unpack32Values(const uint8_t* __restrict__ in, int64_t in_bytes,
237
161k
                                          OutType* __restrict__ out) {
238
161k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
239
161k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
240
161k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
241
161k
    constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH);
242
161k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
243
244
    // Call UnpackValue for 0 <= i < 32.
245
161k
#pragma push_macro("UNPACK_VALUE_CALL")
246
161k
#define UNPACK_VALUE_CALL(ignore1, i, ignore2) \
247
5.15M
    out[i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, i, true>(in));
248
249
5.15M
    BOOST_PP_REPEAT_FROM_TO(0, 32, UNPACK_VALUE_CALL, ignore);
250
161k
    return in + BYTES_TO_READ;
251
161k
#pragma pop_macro("UNPACK_VALUE_CALL")
252
161k
}
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi0EEEPKhS3_lPT_
_ZN5doris10BitPacking14Unpack32ValuesIjLi1EEEPKhS3_lPT_
Line
Count
Source
237
8
                                          OutType* __restrict__ out) {
238
8
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
239
8
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
240
8
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
241
8
    constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH);
242
8
    DCHECK_GE(in_bytes, BYTES_TO_READ);
243
244
    // Call UnpackValue for 0 <= i < 32.
245
8
#pragma push_macro("UNPACK_VALUE_CALL")
246
8
#define UNPACK_VALUE_CALL(ignore1, i, ignore2) \
247
8
    out[i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, i, true>(in));
248
249
    BOOST_PP_REPEAT_FROM_TO(0, 32, UNPACK_VALUE_CALL, ignore);
250
8
    return in + BYTES_TO_READ;
251
8
#pragma pop_macro("UNPACK_VALUE_CALL")
252
8
}
_ZN5doris10BitPacking14Unpack32ValuesIjLi2EEEPKhS3_lPT_
Line
Count
Source
237
16
                                          OutType* __restrict__ out) {
238
16
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
239
16
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
240
16
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
241
16
    constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH);
242
16
    DCHECK_GE(in_bytes, BYTES_TO_READ);
243
244
    // Call UnpackValue for 0 <= i < 32.
245
16
#pragma push_macro("UNPACK_VALUE_CALL")
246
16
#define UNPACK_VALUE_CALL(ignore1, i, ignore2) \
247
16
    out[i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, i, true>(in));
248
249
    BOOST_PP_REPEAT_FROM_TO(0, 32, UNPACK_VALUE_CALL, ignore);
250
16
    return in + BYTES_TO_READ;
251
16
#pragma pop_macro("UNPACK_VALUE_CALL")
252
16
}
_ZN5doris10BitPacking14Unpack32ValuesIjLi3EEEPKhS3_lPT_
Line
Count
Source
237
4.00k
                                          OutType* __restrict__ out) {
238
4.00k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
239
4.00k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
240
4.00k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
241
4.00k
    constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH);
242
4.00k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
243
244
    // Call UnpackValue for 0 <= i < 32.
245
4.00k
#pragma push_macro("UNPACK_VALUE_CALL")
246
4.00k
#define UNPACK_VALUE_CALL(ignore1, i, ignore2) \
247
4.00k
    out[i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, i, true>(in));
248
249
    BOOST_PP_REPEAT_FROM_TO(0, 32, UNPACK_VALUE_CALL, ignore);
250
4.00k
    return in + BYTES_TO_READ;
251
4.00k
#pragma pop_macro("UNPACK_VALUE_CALL")
252
4.00k
}
_ZN5doris10BitPacking14Unpack32ValuesIjLi4EEEPKhS3_lPT_
Line
Count
Source
237
378
                                          OutType* __restrict__ out) {
238
378
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
239
378
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
240
378
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
241
378
    constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH);
242
378
    DCHECK_GE(in_bytes, BYTES_TO_READ);
243
244
    // Call UnpackValue for 0 <= i < 32.
245
378
#pragma push_macro("UNPACK_VALUE_CALL")
246
378
#define UNPACK_VALUE_CALL(ignore1, i, ignore2) \
247
378
    out[i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, i, true>(in));
248
249
    BOOST_PP_REPEAT_FROM_TO(0, 32, UNPACK_VALUE_CALL, ignore);
250
378
    return in + BYTES_TO_READ;
251
378
#pragma pop_macro("UNPACK_VALUE_CALL")
252
378
}
_ZN5doris10BitPacking14Unpack32ValuesIjLi5EEEPKhS3_lPT_
Line
Count
Source
237
3.01k
                                          OutType* __restrict__ out) {
238
3.01k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
239
3.01k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
240
3.01k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
241
3.01k
    constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH);
242
3.01k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
243
244
    // Call UnpackValue for 0 <= i < 32.
245
3.01k
#pragma push_macro("UNPACK_VALUE_CALL")
246
3.01k
#define UNPACK_VALUE_CALL(ignore1, i, ignore2) \
247
3.01k
    out[i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, i, true>(in));
248
249
    BOOST_PP_REPEAT_FROM_TO(0, 32, UNPACK_VALUE_CALL, ignore);
250
3.01k
    return in + BYTES_TO_READ;
251
3.01k
#pragma pop_macro("UNPACK_VALUE_CALL")
252
3.01k
}
_ZN5doris10BitPacking14Unpack32ValuesIjLi6EEEPKhS3_lPT_
Line
Count
Source
237
586
                                          OutType* __restrict__ out) {
238
586
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
239
586
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
240
586
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
241
586
    constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH);
242
586
    DCHECK_GE(in_bytes, BYTES_TO_READ);
243
244
    // Call UnpackValue for 0 <= i < 32.
245
586
#pragma push_macro("UNPACK_VALUE_CALL")
246
586
#define UNPACK_VALUE_CALL(ignore1, i, ignore2) \
247
586
    out[i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, i, true>(in));
248
249
    BOOST_PP_REPEAT_FROM_TO(0, 32, UNPACK_VALUE_CALL, ignore);
250
586
    return in + BYTES_TO_READ;
251
586
#pragma pop_macro("UNPACK_VALUE_CALL")
252
586
}
_ZN5doris10BitPacking14Unpack32ValuesIjLi7EEEPKhS3_lPT_
Line
Count
Source
237
123
                                          OutType* __restrict__ out) {
238
123
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
239
123
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
240
123
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
241
123
    constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH);
242
123
    DCHECK_GE(in_bytes, BYTES_TO_READ);
243
244
    // Call UnpackValue for 0 <= i < 32.
245
123
#pragma push_macro("UNPACK_VALUE_CALL")
246
123
#define UNPACK_VALUE_CALL(ignore1, i, ignore2) \
247
123
    out[i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, i, true>(in));
248
249
    BOOST_PP_REPEAT_FROM_TO(0, 32, UNPACK_VALUE_CALL, ignore);
250
123
    return in + BYTES_TO_READ;
251
123
#pragma pop_macro("UNPACK_VALUE_CALL")
252
123
}
_ZN5doris10BitPacking14Unpack32ValuesIjLi8EEEPKhS3_lPT_
Line
Count
Source
237
18.7k
                                          OutType* __restrict__ out) {
238
18.7k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
239
18.7k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
240
18.7k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
241
18.7k
    constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH);
242
18.7k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
243
244
    // Call UnpackValue for 0 <= i < 32.
245
18.7k
#pragma push_macro("UNPACK_VALUE_CALL")
246
18.7k
#define UNPACK_VALUE_CALL(ignore1, i, ignore2) \
247
18.7k
    out[i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, i, true>(in));
248
249
    BOOST_PP_REPEAT_FROM_TO(0, 32, UNPACK_VALUE_CALL, ignore);
250
18.7k
    return in + BYTES_TO_READ;
251
18.7k
#pragma pop_macro("UNPACK_VALUE_CALL")
252
18.7k
}
_ZN5doris10BitPacking14Unpack32ValuesIjLi9EEEPKhS3_lPT_
Line
Count
Source
237
4.83k
                                          OutType* __restrict__ out) {
238
4.83k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
239
4.83k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
240
4.83k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
241
4.83k
    constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH);
242
4.83k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
243
244
    // Call UnpackValue for 0 <= i < 32.
245
4.83k
#pragma push_macro("UNPACK_VALUE_CALL")
246
4.83k
#define UNPACK_VALUE_CALL(ignore1, i, ignore2) \
247
4.83k
    out[i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, i, true>(in));
248
249
    BOOST_PP_REPEAT_FROM_TO(0, 32, UNPACK_VALUE_CALL, ignore);
250
4.83k
    return in + BYTES_TO_READ;
251
4.83k
#pragma pop_macro("UNPACK_VALUE_CALL")
252
4.83k
}
_ZN5doris10BitPacking14Unpack32ValuesIjLi10EEEPKhS3_lPT_
Line
Count
Source
237
13.2k
                                          OutType* __restrict__ out) {
238
13.2k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
239
13.2k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
240
13.2k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
241
13.2k
    constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH);
242
13.2k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
243
244
    // Call UnpackValue for 0 <= i < 32.
245
13.2k
#pragma push_macro("UNPACK_VALUE_CALL")
246
13.2k
#define UNPACK_VALUE_CALL(ignore1, i, ignore2) \
247
13.2k
    out[i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, i, true>(in));
248
249
    BOOST_PP_REPEAT_FROM_TO(0, 32, UNPACK_VALUE_CALL, ignore);
250
13.2k
    return in + BYTES_TO_READ;
251
13.2k
#pragma pop_macro("UNPACK_VALUE_CALL")
252
13.2k
}
_ZN5doris10BitPacking14Unpack32ValuesIjLi11EEEPKhS3_lPT_
Line
Count
Source
237
38.1k
                                          OutType* __restrict__ out) {
238
38.1k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
239
38.1k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
240
38.1k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
241
38.1k
    constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH);
242
38.1k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
243
244
    // Call UnpackValue for 0 <= i < 32.
245
38.1k
#pragma push_macro("UNPACK_VALUE_CALL")
246
38.1k
#define UNPACK_VALUE_CALL(ignore1, i, ignore2) \
247
38.1k
    out[i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, i, true>(in));
248
249
    BOOST_PP_REPEAT_FROM_TO(0, 32, UNPACK_VALUE_CALL, ignore);
250
38.1k
    return in + BYTES_TO_READ;
251
38.1k
#pragma pop_macro("UNPACK_VALUE_CALL")
252
38.1k
}
_ZN5doris10BitPacking14Unpack32ValuesIjLi12EEEPKhS3_lPT_
Line
Count
Source
237
71.7k
                                          OutType* __restrict__ out) {
238
71.7k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
239
71.7k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
240
71.7k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
241
71.7k
    constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH);
242
71.7k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
243
244
    // Call UnpackValue for 0 <= i < 32.
245
71.7k
#pragma push_macro("UNPACK_VALUE_CALL")
246
71.7k
#define UNPACK_VALUE_CALL(ignore1, i, ignore2) \
247
71.7k
    out[i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, i, true>(in));
248
249
    BOOST_PP_REPEAT_FROM_TO(0, 32, UNPACK_VALUE_CALL, ignore);
250
71.7k
    return in + BYTES_TO_READ;
251
71.7k
#pragma pop_macro("UNPACK_VALUE_CALL")
252
71.7k
}
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi13EEEPKhS3_lPT_
_ZN5doris10BitPacking14Unpack32ValuesIjLi14EEEPKhS3_lPT_
Line
Count
Source
237
1.48k
                                          OutType* __restrict__ out) {
238
1.48k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
239
1.48k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
240
1.48k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
241
1.48k
    constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH);
242
1.48k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
243
244
    // Call UnpackValue for 0 <= i < 32.
245
1.48k
#pragma push_macro("UNPACK_VALUE_CALL")
246
1.48k
#define UNPACK_VALUE_CALL(ignore1, i, ignore2) \
247
1.48k
    out[i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, i, true>(in));
248
249
    BOOST_PP_REPEAT_FROM_TO(0, 32, UNPACK_VALUE_CALL, ignore);
250
1.48k
    return in + BYTES_TO_READ;
251
1.48k
#pragma pop_macro("UNPACK_VALUE_CALL")
252
1.48k
}
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi15EEEPKhS3_lPT_
_ZN5doris10BitPacking14Unpack32ValuesIjLi16EEEPKhS3_lPT_
Line
Count
Source
237
4.18k
                                          OutType* __restrict__ out) {
238
4.18k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
239
4.18k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
240
4.18k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
241
4.18k
    constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH);
242
4.18k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
243
244
    // Call UnpackValue for 0 <= i < 32.
245
4.18k
#pragma push_macro("UNPACK_VALUE_CALL")
246
4.18k
#define UNPACK_VALUE_CALL(ignore1, i, ignore2) \
247
4.18k
    out[i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, i, true>(in));
248
249
    BOOST_PP_REPEAT_FROM_TO(0, 32, UNPACK_VALUE_CALL, ignore);
250
4.18k
    return in + BYTES_TO_READ;
251
4.18k
#pragma pop_macro("UNPACK_VALUE_CALL")
252
4.18k
}
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi17EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi18EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi19EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi20EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi21EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi22EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi23EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi24EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi25EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi26EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi27EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi28EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi29EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi30EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi31EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi32EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi33EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi34EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi35EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi36EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi37EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi38EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi39EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi40EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi41EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi42EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi43EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi44EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi45EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi46EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi47EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi48EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi49EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi50EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi51EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi52EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi53EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi54EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi55EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi56EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi57EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi58EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi59EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi60EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi61EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi62EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi63EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIjLi64EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi0EEEPKhS3_lPT_
_ZN5doris10BitPacking14Unpack32ValuesIhLi1EEEPKhS3_lPT_
Line
Count
Source
237
563
                                          OutType* __restrict__ out) {
238
563
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
239
563
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
240
563
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
241
563
    constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH);
242
563
    DCHECK_GE(in_bytes, BYTES_TO_READ);
243
244
    // Call UnpackValue for 0 <= i < 32.
245
563
#pragma push_macro("UNPACK_VALUE_CALL")
246
563
#define UNPACK_VALUE_CALL(ignore1, i, ignore2) \
247
563
    out[i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, i, true>(in));
248
249
    BOOST_PP_REPEAT_FROM_TO(0, 32, UNPACK_VALUE_CALL, ignore);
250
563
    return in + BYTES_TO_READ;
251
563
#pragma pop_macro("UNPACK_VALUE_CALL")
252
563
}
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi2EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi3EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi4EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi5EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi6EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi7EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi8EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi9EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi10EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi11EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi12EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi13EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi14EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi15EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi16EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi17EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi18EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi19EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi20EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi21EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi22EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi23EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi24EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi25EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi26EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi27EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi28EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi29EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi30EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi31EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi32EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi33EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi34EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi35EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi36EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi37EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi38EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi39EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi40EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi41EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi42EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi43EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi44EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi45EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi46EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi47EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi48EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi49EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi50EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi51EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi52EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi53EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi54EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi55EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi56EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi57EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi58EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi59EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi60EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi61EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi62EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi63EEEPKhS3_lPT_
Unexecuted instantiation: _ZN5doris10BitPacking14Unpack32ValuesIhLi64EEEPKhS3_lPT_
253
254
template <typename OutType>
255
const uint8_t* BitPacking::Unpack32Values(int bit_width, const uint8_t* __restrict__ in,
256
                                          int64_t in_bytes, OutType* __restrict__ out) {
257
#pragma push_macro("UNPACK_VALUES_CASE")
258
#define UNPACK_VALUES_CASE(ignore1, i, ignore2) \
259
    case i:                                     \
260
        return Unpack32Values<OutType, i>(in, in_bytes, out);
261
262
    switch (bit_width) {
263
        // Expand cases from 0 to 64.
264
        BOOST_PP_REPEAT_FROM_TO(0, 65, UNPACK_VALUES_CASE, ignore);
265
    default:
266
        DCHECK(false);
267
        return in;
268
    }
269
#pragma pop_macro("UNPACK_VALUES_CASE")
270
}
271
272
template <typename OutType, int BIT_WIDTH>
273
const uint8_t* BitPacking::UnpackAndDecode32Values(const uint8_t* __restrict__ in, int64_t in_bytes,
274
                                                   OutType* __restrict__ dict, int64_t dict_len,
275
                                                   OutType* __restrict__ out, int64_t stride,
276
                                                   bool* __restrict__ decode_error) {
277
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
278
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
279
    constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH);
280
    DCHECK_GE(in_bytes, BYTES_TO_READ);
281
    // TODO: this could be optimised further by using SIMD instructions.
282
    // https://lemire.me/blog/2016/08/25/faster-dictionary-decoding-with-simd-instructions/
283
284
    static_assert(BIT_WIDTH <= MAX_DICT_BITWIDTH, "Too high bit width for dictionary index.");
285
286
    // Call UnpackValue() and DecodeValue() for 0 <= i < 32.
287
#pragma push_macro("DECODE_VALUE_CALL")
288
#define DECODE_VALUE_CALL(ignore1, i, ignore2)                                               \
289
    {                                                                                        \
290
        uint32_t idx = UnpackValue<BIT_WIDTH, i, true>(in);                                  \
291
        uint8_t* out_pos = reinterpret_cast<uint8_t*>(out) + i * stride;                     \
292
        DecodeValue(dict, dict_len, idx, reinterpret_cast<OutType*>(out_pos), decode_error); \
293
    }
294
295
    BOOST_PP_REPEAT_FROM_TO(0, 32, DECODE_VALUE_CALL, ignore);
296
    return in + BYTES_TO_READ;
297
#pragma pop_macro("DECODE_VALUE_CALL")
298
}
299
300
template <typename OutType, int BIT_WIDTH>
301
const uint8_t* BitPacking::UnpackUpTo31Values(const uint8_t* __restrict__ in, int64_t in_bytes,
302
11.4k
                                              int num_values, OutType* __restrict__ out) {
303
11.4k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
304
11.4k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
305
11.4k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
306
11.4k
    constexpr int MAX_BATCH_SIZE = 31;
307
11.4k
    const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
308
11.4k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
309
11.4k
    DCHECK_LE(num_values, MAX_BATCH_SIZE);
310
311
    // Make sure the buffer is at least 1 byte.
312
11.4k
    constexpr int TMP_BUFFER_SIZE = BIT_WIDTH ? (BIT_WIDTH * (MAX_BATCH_SIZE + 1)) / CHAR_BIT : 1;
313
11.4k
    uint8_t tmp_buffer[TMP_BUFFER_SIZE];
314
315
11.4k
    const uint8_t* in_buffer = in;
316
    // Copy into padded temporary buffer to avoid reading past the end of 'in' if the
317
    // last 32-bit load would go past the end of the buffer.
318
11.4k
    if (BitUtil::round_up(BYTES_TO_READ, sizeof(uint32_t)) > in_bytes) {
319
481
        memcpy(tmp_buffer, in, BYTES_TO_READ);
320
481
        in_buffer = tmp_buffer;
321
481
    }
322
323
11.4k
#pragma push_macro("UNPACK_VALUES_CASE")
324
11.4k
#define UNPACK_VALUES_CASE(ignore1, i, ignore2)                                               \
325
265k
    case 31 - i:                                                                              \
326
265k
        out[30 - i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, 30 - i, false>(in_buffer)); \
327
265k
        [[fallthrough]];
328
329
    // Use switch with fall-through cases to minimise branching.
330
11.4k
    switch (num_values) {
331
        // Expand cases from 31 down to 1.
332
265k
        BOOST_PP_REPEAT_FROM_TO(0, 31, UNPACK_VALUES_CASE, ignore);
333
265k
    case 0:
334
11.4k
        break;
335
0
    default:
336
0
        DCHECK(false);
337
11.4k
    }
338
11.4k
    return in + BYTES_TO_READ;
339
11.4k
#pragma pop_macro("UNPACK_VALUES_CASE")
340
11.4k
}
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi0EEEPKhS3_liPT_
_ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi1EEEPKhS3_liPT_
Line
Count
Source
302
74
                                              int num_values, OutType* __restrict__ out) {
303
74
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
304
74
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
305
74
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
306
74
    constexpr int MAX_BATCH_SIZE = 31;
307
74
    const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
308
74
    DCHECK_GE(in_bytes, BYTES_TO_READ);
309
74
    DCHECK_LE(num_values, MAX_BATCH_SIZE);
310
311
    // Make sure the buffer is at least 1 byte.
312
74
    constexpr int TMP_BUFFER_SIZE = BIT_WIDTH ? (BIT_WIDTH * (MAX_BATCH_SIZE + 1)) / CHAR_BIT : 1;
313
74
    uint8_t tmp_buffer[TMP_BUFFER_SIZE];
314
315
74
    const uint8_t* in_buffer = in;
316
    // Copy into padded temporary buffer to avoid reading past the end of 'in' if the
317
    // last 32-bit load would go past the end of the buffer.
318
74
    if (BitUtil::round_up(BYTES_TO_READ, sizeof(uint32_t)) > in_bytes) {
319
36
        memcpy(tmp_buffer, in, BYTES_TO_READ);
320
36
        in_buffer = tmp_buffer;
321
36
    }
322
323
74
#pragma push_macro("UNPACK_VALUES_CASE")
324
74
#define UNPACK_VALUES_CASE(ignore1, i, ignore2)                                               \
325
74
    case 31 - i:                                                                              \
326
74
        out[30 - i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, 30 - i, false>(in_buffer)); \
327
74
        [[fallthrough]];
328
329
    // Use switch with fall-through cases to minimise branching.
330
74
    switch (num_values) {
331
        // Expand cases from 31 down to 1.
332
912
        BOOST_PP_REPEAT_FROM_TO(0, 31, UNPACK_VALUES_CASE, ignore);
333
912
    case 0:
334
74
        break;
335
0
    default:
336
0
        DCHECK(false);
337
74
    }
338
74
    return in + BYTES_TO_READ;
339
74
#pragma pop_macro("UNPACK_VALUES_CASE")
340
74
}
_ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi2EEEPKhS3_liPT_
Line
Count
Source
302
124
                                              int num_values, OutType* __restrict__ out) {
303
124
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
304
124
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
305
124
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
306
124
    constexpr int MAX_BATCH_SIZE = 31;
307
124
    const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
308
124
    DCHECK_GE(in_bytes, BYTES_TO_READ);
309
124
    DCHECK_LE(num_values, MAX_BATCH_SIZE);
310
311
    // Make sure the buffer is at least 1 byte.
312
124
    constexpr int TMP_BUFFER_SIZE = BIT_WIDTH ? (BIT_WIDTH * (MAX_BATCH_SIZE + 1)) / CHAR_BIT : 1;
313
124
    uint8_t tmp_buffer[TMP_BUFFER_SIZE];
314
315
124
    const uint8_t* in_buffer = in;
316
    // Copy into padded temporary buffer to avoid reading past the end of 'in' if the
317
    // last 32-bit load would go past the end of the buffer.
318
124
    if (BitUtil::round_up(BYTES_TO_READ, sizeof(uint32_t)) > in_bytes) {
319
74
        memcpy(tmp_buffer, in, BYTES_TO_READ);
320
74
        in_buffer = tmp_buffer;
321
74
    }
322
323
124
#pragma push_macro("UNPACK_VALUES_CASE")
324
124
#define UNPACK_VALUES_CASE(ignore1, i, ignore2)                                               \
325
124
    case 31 - i:                                                                              \
326
124
        out[30 - i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, 30 - i, false>(in_buffer)); \
327
124
        [[fallthrough]];
328
329
    // Use switch with fall-through cases to minimise branching.
330
124
    switch (num_values) {
331
        // Expand cases from 31 down to 1.
332
1.41k
        BOOST_PP_REPEAT_FROM_TO(0, 31, UNPACK_VALUES_CASE, ignore);
333
1.41k
    case 0:
334
124
        break;
335
0
    default:
336
0
        DCHECK(false);
337
124
    }
338
124
    return in + BYTES_TO_READ;
339
124
#pragma pop_macro("UNPACK_VALUES_CASE")
340
124
}
_ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi3EEEPKhS3_liPT_
Line
Count
Source
302
465
                                              int num_values, OutType* __restrict__ out) {
303
465
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
304
465
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
305
465
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
306
465
    constexpr int MAX_BATCH_SIZE = 31;
307
465
    const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
308
465
    DCHECK_GE(in_bytes, BYTES_TO_READ);
309
465
    DCHECK_LE(num_values, MAX_BATCH_SIZE);
310
311
    // Make sure the buffer is at least 1 byte.
312
465
    constexpr int TMP_BUFFER_SIZE = BIT_WIDTH ? (BIT_WIDTH * (MAX_BATCH_SIZE + 1)) / CHAR_BIT : 1;
313
465
    uint8_t tmp_buffer[TMP_BUFFER_SIZE];
314
315
465
    const uint8_t* in_buffer = in;
316
    // Copy into padded temporary buffer to avoid reading past the end of 'in' if the
317
    // last 32-bit load would go past the end of the buffer.
318
465
    if (BitUtil::round_up(BYTES_TO_READ, sizeof(uint32_t)) > in_bytes) {
319
161
        memcpy(tmp_buffer, in, BYTES_TO_READ);
320
161
        in_buffer = tmp_buffer;
321
161
    }
322
323
465
#pragma push_macro("UNPACK_VALUES_CASE")
324
465
#define UNPACK_VALUES_CASE(ignore1, i, ignore2)                                               \
325
465
    case 31 - i:                                                                              \
326
465
        out[30 - i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, 30 - i, false>(in_buffer)); \
327
465
        [[fallthrough]];
328
329
    // Use switch with fall-through cases to minimise branching.
330
465
    switch (num_values) {
331
        // Expand cases from 31 down to 1.
332
8.52k
        BOOST_PP_REPEAT_FROM_TO(0, 31, UNPACK_VALUES_CASE, ignore);
333
8.52k
    case 0:
334
465
        break;
335
0
    default:
336
0
        DCHECK(false);
337
465
    }
338
465
    return in + BYTES_TO_READ;
339
465
#pragma pop_macro("UNPACK_VALUES_CASE")
340
465
}
_ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi4EEEPKhS3_liPT_
Line
Count
Source
302
194
                                              int num_values, OutType* __restrict__ out) {
303
194
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
304
194
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
305
194
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
306
194
    constexpr int MAX_BATCH_SIZE = 31;
307
194
    const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
308
194
    DCHECK_GE(in_bytes, BYTES_TO_READ);
309
194
    DCHECK_LE(num_values, MAX_BATCH_SIZE);
310
311
    // Make sure the buffer is at least 1 byte.
312
194
    constexpr int TMP_BUFFER_SIZE = BIT_WIDTH ? (BIT_WIDTH * (MAX_BATCH_SIZE + 1)) / CHAR_BIT : 1;
313
194
    uint8_t tmp_buffer[TMP_BUFFER_SIZE];
314
315
194
    const uint8_t* in_buffer = in;
316
    // Copy into padded temporary buffer to avoid reading past the end of 'in' if the
317
    // last 32-bit load would go past the end of the buffer.
318
194
    if (BitUtil::round_up(BYTES_TO_READ, sizeof(uint32_t)) > in_bytes) {
319
0
        memcpy(tmp_buffer, in, BYTES_TO_READ);
320
0
        in_buffer = tmp_buffer;
321
0
    }
322
323
194
#pragma push_macro("UNPACK_VALUES_CASE")
324
194
#define UNPACK_VALUES_CASE(ignore1, i, ignore2)                                               \
325
194
    case 31 - i:                                                                              \
326
194
        out[30 - i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, 30 - i, false>(in_buffer)); \
327
194
        [[fallthrough]];
328
329
    // Use switch with fall-through cases to minimise branching.
330
194
    switch (num_values) {
331
        // Expand cases from 31 down to 1.
332
3.27k
        BOOST_PP_REPEAT_FROM_TO(0, 31, UNPACK_VALUES_CASE, ignore);
333
3.27k
    case 0:
334
194
        break;
335
0
    default:
336
0
        DCHECK(false);
337
194
    }
338
194
    return in + BYTES_TO_READ;
339
194
#pragma pop_macro("UNPACK_VALUES_CASE")
340
194
}
_ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi5EEEPKhS3_liPT_
Line
Count
Source
302
276
                                              int num_values, OutType* __restrict__ out) {
303
276
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
304
276
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
305
276
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
306
276
    constexpr int MAX_BATCH_SIZE = 31;
307
276
    const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
308
276
    DCHECK_GE(in_bytes, BYTES_TO_READ);
309
276
    DCHECK_LE(num_values, MAX_BATCH_SIZE);
310
311
    // Make sure the buffer is at least 1 byte.
312
276
    constexpr int TMP_BUFFER_SIZE = BIT_WIDTH ? (BIT_WIDTH * (MAX_BATCH_SIZE + 1)) / CHAR_BIT : 1;
313
276
    uint8_t tmp_buffer[TMP_BUFFER_SIZE];
314
315
276
    const uint8_t* in_buffer = in;
316
    // Copy into padded temporary buffer to avoid reading past the end of 'in' if the
317
    // last 32-bit load would go past the end of the buffer.
318
276
    if (BitUtil::round_up(BYTES_TO_READ, sizeof(uint32_t)) > in_bytes) {
319
70
        memcpy(tmp_buffer, in, BYTES_TO_READ);
320
70
        in_buffer = tmp_buffer;
321
70
    }
322
323
276
#pragma push_macro("UNPACK_VALUES_CASE")
324
276
#define UNPACK_VALUES_CASE(ignore1, i, ignore2)                                               \
325
276
    case 31 - i:                                                                              \
326
276
        out[30 - i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, 30 - i, false>(in_buffer)); \
327
276
        [[fallthrough]];
328
329
    // Use switch with fall-through cases to minimise branching.
330
276
    switch (num_values) {
331
        // Expand cases from 31 down to 1.
332
6.12k
        BOOST_PP_REPEAT_FROM_TO(0, 31, UNPACK_VALUES_CASE, ignore);
333
6.12k
    case 0:
334
276
        break;
335
0
    default:
336
0
        DCHECK(false);
337
276
    }
338
276
    return in + BYTES_TO_READ;
339
276
#pragma pop_macro("UNPACK_VALUES_CASE")
340
276
}
_ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi6EEEPKhS3_liPT_
Line
Count
Source
302
76
                                              int num_values, OutType* __restrict__ out) {
303
76
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
304
76
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
305
76
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
306
76
    constexpr int MAX_BATCH_SIZE = 31;
307
76
    const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
308
76
    DCHECK_GE(in_bytes, BYTES_TO_READ);
309
76
    DCHECK_LE(num_values, MAX_BATCH_SIZE);
310
311
    // Make sure the buffer is at least 1 byte.
312
76
    constexpr int TMP_BUFFER_SIZE = BIT_WIDTH ? (BIT_WIDTH * (MAX_BATCH_SIZE + 1)) / CHAR_BIT : 1;
313
76
    uint8_t tmp_buffer[TMP_BUFFER_SIZE];
314
315
76
    const uint8_t* in_buffer = in;
316
    // Copy into padded temporary buffer to avoid reading past the end of 'in' if the
317
    // last 32-bit load would go past the end of the buffer.
318
76
    if (BitUtil::round_up(BYTES_TO_READ, sizeof(uint32_t)) > in_bytes) {
319
41
        memcpy(tmp_buffer, in, BYTES_TO_READ);
320
41
        in_buffer = tmp_buffer;
321
41
    }
322
323
76
#pragma push_macro("UNPACK_VALUES_CASE")
324
76
#define UNPACK_VALUES_CASE(ignore1, i, ignore2)                                               \
325
76
    case 31 - i:                                                                              \
326
76
        out[30 - i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, 30 - i, false>(in_buffer)); \
327
76
        [[fallthrough]];
328
329
    // Use switch with fall-through cases to minimise branching.
330
76
    switch (num_values) {
331
        // Expand cases from 31 down to 1.
332
1.27k
        BOOST_PP_REPEAT_FROM_TO(0, 31, UNPACK_VALUES_CASE, ignore);
333
1.27k
    case 0:
334
76
        break;
335
0
    default:
336
0
        DCHECK(false);
337
76
    }
338
76
    return in + BYTES_TO_READ;
339
76
#pragma pop_macro("UNPACK_VALUES_CASE")
340
76
}
_ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi7EEEPKhS3_liPT_
Line
Count
Source
302
41
                                              int num_values, OutType* __restrict__ out) {
303
41
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
304
41
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
305
41
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
306
41
    constexpr int MAX_BATCH_SIZE = 31;
307
41
    const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
308
41
    DCHECK_GE(in_bytes, BYTES_TO_READ);
309
41
    DCHECK_LE(num_values, MAX_BATCH_SIZE);
310
311
    // Make sure the buffer is at least 1 byte.
312
41
    constexpr int TMP_BUFFER_SIZE = BIT_WIDTH ? (BIT_WIDTH * (MAX_BATCH_SIZE + 1)) / CHAR_BIT : 1;
313
41
    uint8_t tmp_buffer[TMP_BUFFER_SIZE];
314
315
41
    const uint8_t* in_buffer = in;
316
    // Copy into padded temporary buffer to avoid reading past the end of 'in' if the
317
    // last 32-bit load would go past the end of the buffer.
318
41
    if (BitUtil::round_up(BYTES_TO_READ, sizeof(uint32_t)) > in_bytes) {
319
41
        memcpy(tmp_buffer, in, BYTES_TO_READ);
320
41
        in_buffer = tmp_buffer;
321
41
    }
322
323
41
#pragma push_macro("UNPACK_VALUES_CASE")
324
41
#define UNPACK_VALUES_CASE(ignore1, i, ignore2)                                               \
325
41
    case 31 - i:                                                                              \
326
41
        out[30 - i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, 30 - i, false>(in_buffer)); \
327
41
        [[fallthrough]];
328
329
    // Use switch with fall-through cases to minimise branching.
330
41
    switch (num_values) {
331
        // Expand cases from 31 down to 1.
332
328
        BOOST_PP_REPEAT_FROM_TO(0, 31, UNPACK_VALUES_CASE, ignore);
333
328
    case 0:
334
41
        break;
335
0
    default:
336
0
        DCHECK(false);
337
41
    }
338
41
    return in + BYTES_TO_READ;
339
41
#pragma pop_macro("UNPACK_VALUES_CASE")
340
41
}
_ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi8EEEPKhS3_liPT_
Line
Count
Source
302
1.25k
                                              int num_values, OutType* __restrict__ out) {
303
1.25k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
304
1.25k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
305
1.25k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
306
1.25k
    constexpr int MAX_BATCH_SIZE = 31;
307
1.25k
    const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
308
1.25k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
309
1.25k
    DCHECK_LE(num_values, MAX_BATCH_SIZE);
310
311
    // Make sure the buffer is at least 1 byte.
312
1.25k
    constexpr int TMP_BUFFER_SIZE = BIT_WIDTH ? (BIT_WIDTH * (MAX_BATCH_SIZE + 1)) / CHAR_BIT : 1;
313
1.25k
    uint8_t tmp_buffer[TMP_BUFFER_SIZE];
314
315
1.25k
    const uint8_t* in_buffer = in;
316
    // Copy into padded temporary buffer to avoid reading past the end of 'in' if the
317
    // last 32-bit load would go past the end of the buffer.
318
1.25k
    if (BitUtil::round_up(BYTES_TO_READ, sizeof(uint32_t)) > in_bytes) {
319
0
        memcpy(tmp_buffer, in, BYTES_TO_READ);
320
0
        in_buffer = tmp_buffer;
321
0
    }
322
323
1.25k
#pragma push_macro("UNPACK_VALUES_CASE")
324
1.25k
#define UNPACK_VALUES_CASE(ignore1, i, ignore2)                                               \
325
1.25k
    case 31 - i:                                                                              \
326
1.25k
        out[30 - i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, 30 - i, false>(in_buffer)); \
327
1.25k
        [[fallthrough]];
328
329
    // Use switch with fall-through cases to minimise branching.
330
1.25k
    switch (num_values) {
331
        // Expand cases from 31 down to 1.
332
30.0k
        BOOST_PP_REPEAT_FROM_TO(0, 31, UNPACK_VALUES_CASE, ignore);
333
30.0k
    case 0:
334
1.25k
        break;
335
0
    default:
336
0
        DCHECK(false);
337
1.25k
    }
338
1.25k
    return in + BYTES_TO_READ;
339
1.25k
#pragma pop_macro("UNPACK_VALUES_CASE")
340
1.25k
}
_ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi9EEEPKhS3_liPT_
Line
Count
Source
302
314
                                              int num_values, OutType* __restrict__ out) {
303
314
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
304
314
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
305
314
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
306
314
    constexpr int MAX_BATCH_SIZE = 31;
307
314
    const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
308
314
    DCHECK_GE(in_bytes, BYTES_TO_READ);
309
314
    DCHECK_LE(num_values, MAX_BATCH_SIZE);
310
311
    // Make sure the buffer is at least 1 byte.
312
314
    constexpr int TMP_BUFFER_SIZE = BIT_WIDTH ? (BIT_WIDTH * (MAX_BATCH_SIZE + 1)) / CHAR_BIT : 1;
313
314
    uint8_t tmp_buffer[TMP_BUFFER_SIZE];
314
315
314
    const uint8_t* in_buffer = in;
316
    // Copy into padded temporary buffer to avoid reading past the end of 'in' if the
317
    // last 32-bit load would go past the end of the buffer.
318
314
    if (BitUtil::round_up(BYTES_TO_READ, sizeof(uint32_t)) > in_bytes) {
319
0
        memcpy(tmp_buffer, in, BYTES_TO_READ);
320
0
        in_buffer = tmp_buffer;
321
0
    }
322
323
314
#pragma push_macro("UNPACK_VALUES_CASE")
324
314
#define UNPACK_VALUES_CASE(ignore1, i, ignore2)                                               \
325
314
    case 31 - i:                                                                              \
326
314
        out[30 - i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, 30 - i, false>(in_buffer)); \
327
314
        [[fallthrough]];
328
329
    // Use switch with fall-through cases to minimise branching.
330
314
    switch (num_values) {
331
        // Expand cases from 31 down to 1.
332
7.53k
        BOOST_PP_REPEAT_FROM_TO(0, 31, UNPACK_VALUES_CASE, ignore);
333
7.53k
    case 0:
334
314
        break;
335
0
    default:
336
0
        DCHECK(false);
337
314
    }
338
314
    return in + BYTES_TO_READ;
339
314
#pragma pop_macro("UNPACK_VALUES_CASE")
340
314
}
_ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi10EEEPKhS3_liPT_
Line
Count
Source
302
884
                                              int num_values, OutType* __restrict__ out) {
303
884
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
304
884
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
305
884
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
306
884
    constexpr int MAX_BATCH_SIZE = 31;
307
884
    const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
308
884
    DCHECK_GE(in_bytes, BYTES_TO_READ);
309
884
    DCHECK_LE(num_values, MAX_BATCH_SIZE);
310
311
    // Make sure the buffer is at least 1 byte.
312
884
    constexpr int TMP_BUFFER_SIZE = BIT_WIDTH ? (BIT_WIDTH * (MAX_BATCH_SIZE + 1)) / CHAR_BIT : 1;
313
884
    uint8_t tmp_buffer[TMP_BUFFER_SIZE];
314
315
884
    const uint8_t* in_buffer = in;
316
    // Copy into padded temporary buffer to avoid reading past the end of 'in' if the
317
    // last 32-bit load would go past the end of the buffer.
318
884
    if (BitUtil::round_up(BYTES_TO_READ, sizeof(uint32_t)) > in_bytes) {
319
6
        memcpy(tmp_buffer, in, BYTES_TO_READ);
320
6
        in_buffer = tmp_buffer;
321
6
    }
322
323
884
#pragma push_macro("UNPACK_VALUES_CASE")
324
884
#define UNPACK_VALUES_CASE(ignore1, i, ignore2)                                               \
325
884
    case 31 - i:                                                                              \
326
884
        out[30 - i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, 30 - i, false>(in_buffer)); \
327
884
        [[fallthrough]];
328
329
    // Use switch with fall-through cases to minimise branching.
330
884
    switch (num_values) {
331
        // Expand cases from 31 down to 1.
332
21.1k
        BOOST_PP_REPEAT_FROM_TO(0, 31, UNPACK_VALUES_CASE, ignore);
333
21.1k
    case 0:
334
884
        break;
335
0
    default:
336
0
        DCHECK(false);
337
884
    }
338
884
    return in + BYTES_TO_READ;
339
884
#pragma pop_macro("UNPACK_VALUES_CASE")
340
884
}
_ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi11EEEPKhS3_liPT_
Line
Count
Source
302
2.51k
                                              int num_values, OutType* __restrict__ out) {
303
2.51k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
304
2.51k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
305
2.51k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
306
2.51k
    constexpr int MAX_BATCH_SIZE = 31;
307
2.51k
    const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
308
2.51k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
309
2.51k
    DCHECK_LE(num_values, MAX_BATCH_SIZE);
310
311
    // Make sure the buffer is at least 1 byte.
312
2.51k
    constexpr int TMP_BUFFER_SIZE = BIT_WIDTH ? (BIT_WIDTH * (MAX_BATCH_SIZE + 1)) / CHAR_BIT : 1;
313
2.51k
    uint8_t tmp_buffer[TMP_BUFFER_SIZE];
314
315
2.51k
    const uint8_t* in_buffer = in;
316
    // Copy into padded temporary buffer to avoid reading past the end of 'in' if the
317
    // last 32-bit load would go past the end of the buffer.
318
2.51k
    if (BitUtil::round_up(BYTES_TO_READ, sizeof(uint32_t)) > in_bytes) {
319
10
        memcpy(tmp_buffer, in, BYTES_TO_READ);
320
10
        in_buffer = tmp_buffer;
321
10
    }
322
323
2.51k
#pragma push_macro("UNPACK_VALUES_CASE")
324
2.51k
#define UNPACK_VALUES_CASE(ignore1, i, ignore2)                                               \
325
2.51k
    case 31 - i:                                                                              \
326
2.51k
        out[30 - i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, 30 - i, false>(in_buffer)); \
327
2.51k
        [[fallthrough]];
328
329
    // Use switch with fall-through cases to minimise branching.
330
2.51k
    switch (num_values) {
331
        // Expand cases from 31 down to 1.
332
60.2k
        BOOST_PP_REPEAT_FROM_TO(0, 31, UNPACK_VALUES_CASE, ignore);
333
60.2k
    case 0:
334
2.51k
        break;
335
0
    default:
336
0
        DCHECK(false);
337
2.51k
    }
338
2.51k
    return in + BYTES_TO_READ;
339
2.51k
#pragma pop_macro("UNPACK_VALUES_CASE")
340
2.51k
}
_ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi12EEEPKhS3_liPT_
Line
Count
Source
302
4.78k
                                              int num_values, OutType* __restrict__ out) {
303
4.78k
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
304
4.78k
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
305
4.78k
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
306
4.78k
    constexpr int MAX_BATCH_SIZE = 31;
307
4.78k
    const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
308
4.78k
    DCHECK_GE(in_bytes, BYTES_TO_READ);
309
4.78k
    DCHECK_LE(num_values, MAX_BATCH_SIZE);
310
311
    // Make sure the buffer is at least 1 byte.
312
4.78k
    constexpr int TMP_BUFFER_SIZE = BIT_WIDTH ? (BIT_WIDTH * (MAX_BATCH_SIZE + 1)) / CHAR_BIT : 1;
313
4.78k
    uint8_t tmp_buffer[TMP_BUFFER_SIZE];
314
315
4.78k
    const uint8_t* in_buffer = in;
316
    // Copy into padded temporary buffer to avoid reading past the end of 'in' if the
317
    // last 32-bit load would go past the end of the buffer.
318
4.78k
    if (BitUtil::round_up(BYTES_TO_READ, sizeof(uint32_t)) > in_bytes) {
319
0
        memcpy(tmp_buffer, in, BYTES_TO_READ);
320
0
        in_buffer = tmp_buffer;
321
0
    }
322
323
4.78k
#pragma push_macro("UNPACK_VALUES_CASE")
324
4.78k
#define UNPACK_VALUES_CASE(ignore1, i, ignore2)                                               \
325
4.78k
    case 31 - i:                                                                              \
326
4.78k
        out[30 - i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, 30 - i, false>(in_buffer)); \
327
4.78k
        [[fallthrough]];
328
329
    // Use switch with fall-through cases to minimise branching.
330
4.78k
    switch (num_values) {
331
        // Expand cases from 31 down to 1.
332
114k
        BOOST_PP_REPEAT_FROM_TO(0, 31, UNPACK_VALUES_CASE, ignore);
333
114k
    case 0:
334
4.78k
        break;
335
0
    default:
336
0
        DCHECK(false);
337
4.78k
    }
338
4.78k
    return in + BYTES_TO_READ;
339
4.78k
#pragma pop_macro("UNPACK_VALUES_CASE")
340
4.78k
}
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi13EEEPKhS3_liPT_
_ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi14EEEPKhS3_liPT_
Line
Count
Source
302
99
                                              int num_values, OutType* __restrict__ out) {
303
99
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
304
99
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
305
99
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
306
99
    constexpr int MAX_BATCH_SIZE = 31;
307
99
    const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
308
99
    DCHECK_GE(in_bytes, BYTES_TO_READ);
309
99
    DCHECK_LE(num_values, MAX_BATCH_SIZE);
310
311
    // Make sure the buffer is at least 1 byte.
312
99
    constexpr int TMP_BUFFER_SIZE = BIT_WIDTH ? (BIT_WIDTH * (MAX_BATCH_SIZE + 1)) / CHAR_BIT : 1;
313
99
    uint8_t tmp_buffer[TMP_BUFFER_SIZE];
314
315
99
    const uint8_t* in_buffer = in;
316
    // Copy into padded temporary buffer to avoid reading past the end of 'in' if the
317
    // last 32-bit load would go past the end of the buffer.
318
99
    if (BitUtil::round_up(BYTES_TO_READ, sizeof(uint32_t)) > in_bytes) {
319
1
        memcpy(tmp_buffer, in, BYTES_TO_READ);
320
1
        in_buffer = tmp_buffer;
321
1
    }
322
323
99
#pragma push_macro("UNPACK_VALUES_CASE")
324
99
#define UNPACK_VALUES_CASE(ignore1, i, ignore2)                                               \
325
99
    case 31 - i:                                                                              \
326
99
        out[30 - i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, 30 - i, false>(in_buffer)); \
327
99
        [[fallthrough]];
328
329
    // Use switch with fall-through cases to minimise branching.
330
99
    switch (num_values) {
331
        // Expand cases from 31 down to 1.
332
2.36k
        BOOST_PP_REPEAT_FROM_TO(0, 31, UNPACK_VALUES_CASE, ignore);
333
2.36k
    case 0:
334
99
        break;
335
0
    default:
336
0
        DCHECK(false);
337
99
    }
338
99
    return in + BYTES_TO_READ;
339
99
#pragma pop_macro("UNPACK_VALUES_CASE")
340
99
}
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi15EEEPKhS3_liPT_
_ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi16EEEPKhS3_liPT_
Line
Count
Source
302
280
                                              int num_values, OutType* __restrict__ out) {
303
280
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
304
280
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
305
280
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
306
280
    constexpr int MAX_BATCH_SIZE = 31;
307
280
    const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
308
280
    DCHECK_GE(in_bytes, BYTES_TO_READ);
309
280
    DCHECK_LE(num_values, MAX_BATCH_SIZE);
310
311
    // Make sure the buffer is at least 1 byte.
312
280
    constexpr int TMP_BUFFER_SIZE = BIT_WIDTH ? (BIT_WIDTH * (MAX_BATCH_SIZE + 1)) / CHAR_BIT : 1;
313
280
    uint8_t tmp_buffer[TMP_BUFFER_SIZE];
314
315
280
    const uint8_t* in_buffer = in;
316
    // Copy into padded temporary buffer to avoid reading past the end of 'in' if the
317
    // last 32-bit load would go past the end of the buffer.
318
280
    if (BitUtil::round_up(BYTES_TO_READ, sizeof(uint32_t)) > in_bytes) {
319
0
        memcpy(tmp_buffer, in, BYTES_TO_READ);
320
0
        in_buffer = tmp_buffer;
321
0
    }
322
323
280
#pragma push_macro("UNPACK_VALUES_CASE")
324
280
#define UNPACK_VALUES_CASE(ignore1, i, ignore2)                                               \
325
280
    case 31 - i:                                                                              \
326
280
        out[30 - i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, 30 - i, false>(in_buffer)); \
327
280
        [[fallthrough]];
328
329
    // Use switch with fall-through cases to minimise branching.
330
280
    switch (num_values) {
331
        // Expand cases from 31 down to 1.
332
6.72k
        BOOST_PP_REPEAT_FROM_TO(0, 31, UNPACK_VALUES_CASE, ignore);
333
6.72k
    case 0:
334
280
        break;
335
0
    default:
336
0
        DCHECK(false);
337
280
    }
338
280
    return in + BYTES_TO_READ;
339
280
#pragma pop_macro("UNPACK_VALUES_CASE")
340
280
}
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi17EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi18EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi19EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi20EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi21EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi22EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi23EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi24EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi25EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi26EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi27EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi28EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi29EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi30EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi31EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi32EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi33EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi34EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi35EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi36EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi37EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi38EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi39EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi40EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi41EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi42EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi43EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi44EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi45EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi46EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi47EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi48EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi49EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi50EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi51EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi52EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi53EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi54EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi55EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi56EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi57EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi58EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi59EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi60EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi61EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi62EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi63EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIjLi64EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi0EEEPKhS3_liPT_
_ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi1EEEPKhS3_liPT_
Line
Count
Source
302
41
                                              int num_values, OutType* __restrict__ out) {
303
41
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
304
41
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
305
41
    DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
306
41
    constexpr int MAX_BATCH_SIZE = 31;
307
41
    const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
308
41
    DCHECK_GE(in_bytes, BYTES_TO_READ);
309
41
    DCHECK_LE(num_values, MAX_BATCH_SIZE);
310
311
    // Make sure the buffer is at least 1 byte.
312
41
    constexpr int TMP_BUFFER_SIZE = BIT_WIDTH ? (BIT_WIDTH * (MAX_BATCH_SIZE + 1)) / CHAR_BIT : 1;
313
41
    uint8_t tmp_buffer[TMP_BUFFER_SIZE];
314
315
41
    const uint8_t* in_buffer = in;
316
    // Copy into padded temporary buffer to avoid reading past the end of 'in' if the
317
    // last 32-bit load would go past the end of the buffer.
318
41
    if (BitUtil::round_up(BYTES_TO_READ, sizeof(uint32_t)) > in_bytes) {
319
41
        memcpy(tmp_buffer, in, BYTES_TO_READ);
320
41
        in_buffer = tmp_buffer;
321
41
    }
322
323
41
#pragma push_macro("UNPACK_VALUES_CASE")
324
41
#define UNPACK_VALUES_CASE(ignore1, i, ignore2)                                               \
325
41
    case 31 - i:                                                                              \
326
41
        out[30 - i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, 30 - i, false>(in_buffer)); \
327
41
        [[fallthrough]];
328
329
    // Use switch with fall-through cases to minimise branching.
330
41
    switch (num_values) {
331
        // Expand cases from 31 down to 1.
332
632
        BOOST_PP_REPEAT_FROM_TO(0, 31, UNPACK_VALUES_CASE, ignore);
333
632
    case 0:
334
41
        break;
335
0
    default:
336
0
        DCHECK(false);
337
41
    }
338
41
    return in + BYTES_TO_READ;
339
41
#pragma pop_macro("UNPACK_VALUES_CASE")
340
41
}
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi2EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi3EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi4EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi5EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi6EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi7EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi8EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi9EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi10EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi11EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi12EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi13EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi14EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi15EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi16EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi17EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi18EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi19EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi20EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi21EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi22EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi23EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi24EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi25EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi26EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi27EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi28EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi29EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi30EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi31EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi32EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi33EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi34EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi35EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi36EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi37EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi38EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi39EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi40EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi41EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi42EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi43EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi44EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi45EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi46EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi47EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi48EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi49EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi50EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi51EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi52EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi53EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi54EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi55EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi56EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi57EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi58EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi59EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi60EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi61EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi62EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi63EEEPKhS3_liPT_
Unexecuted instantiation: _ZN5doris10BitPacking18UnpackUpTo31ValuesIhLi64EEEPKhS3_liPT_
341
342
template <typename OutType, int BIT_WIDTH>
343
const uint8_t* BitPacking::UnpackAndDecodeUpTo31Values(const uint8_t* __restrict__ in,
344
                                                       int64_t in_bytes, OutType* __restrict__ dict,
345
                                                       int64_t dict_len, int num_values,
346
                                                       OutType* __restrict__ out, int64_t stride,
347
                                                       bool* __restrict__ decode_error) {
348
    static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
349
    static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
350
    constexpr int MAX_BATCH_SIZE = 31;
351
    const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
352
    DCHECK_GE(in_bytes, BYTES_TO_READ);
353
    DCHECK_LE(num_values, MAX_BATCH_SIZE);
354
355
    // Make sure the buffer is at least 1 byte.
356
    constexpr int TMP_BUFFER_SIZE = BIT_WIDTH ? (BIT_WIDTH * (MAX_BATCH_SIZE + 1)) / CHAR_BIT : 1;
357
    uint8_t tmp_buffer[TMP_BUFFER_SIZE];
358
359
    const uint8_t* in_buffer = in;
360
    // Copy into padded temporary buffer to avoid reading past the end of 'in' if the
361
    // last 32-bit load would go past the end of the buffer.
362
    if (BitUtil::round_up(BYTES_TO_READ, sizeof(uint32_t)) > in_bytes) {
363
        memcpy(tmp_buffer, in, BYTES_TO_READ);
364
        in_buffer = tmp_buffer;
365
    }
366
367
#pragma push_macro("DECODE_VALUES_CASE")
368
#define DECODE_VALUES_CASE(ignore1, i, ignore2)                                              \
369
    case 31 - i: {                                                                           \
370
        uint32_t idx = UnpackValue<BIT_WIDTH, 30 - i, false>(in_buffer);                     \
371
        uint8_t* out_pos = reinterpret_cast<uint8_t*>(out) + (30 - i) * stride;              \
372
        DecodeValue(dict, dict_len, idx, reinterpret_cast<OutType*>(out_pos), decode_error); \
373
    }
374
375
    // Use switch with fall-through cases to minimise branching.
376
    switch (num_values) {
377
        // Expand cases from 31 down to 1.
378
        BOOST_PP_REPEAT_FROM_TO(0, 31, DECODE_VALUES_CASE, ignore);
379
    case 0:
380
        break;
381
    default:
382
        DCHECK(false);
383
    }
384
    return in + BYTES_TO_READ;
385
#pragma pop_macro("DECODE_VALUES_CASE")
386
}
387
#include "common/compile_check_end.h"
388
} // namespace doris